In [18]:
# Text Classification with fastText
# Importing libraries
import numpy as np, pandas as pd

# NLP Preprocessing
from gensim.utils import simple_preprocess

import csv
import fasttext

In [4]:
# Load the training and validation splits, keeping only the question body
# and its quality label under friendlier column names.
column_names = {'Body': 'questions', 'Y': 'category'}
source_columns = list(column_names)

train_frame = pd.read_csv('train.csv')
dataset = train_frame[source_columns].rename(columns = column_names)

valid_frame = pd.read_csv('valid.csv')
ds = valid_frame[source_columns].rename(columns = column_names)

In [6]:
# Preview the first five validation rows as loaded
ds.head(n = 5)

Unnamed: 0,questions,category
0,I am having 4 different tables like \r\nselect...,LQ_EDIT
1,I have two table m_master and tbl_appointment\...,LQ_EDIT
2,<p>I'm trying to extract US states from wiki U...,HQ
3,"I'm so new to C#, I wanna make an application ...",LQ_EDIT
4,basically i have this array:\r\n\r\n array(...,LQ_EDIT


In [7]:
# Text normalisation: run every question through gensim's simple_preprocess
# and glue the resulting tokens back together with single spaces.
def _normalise_question(raw_text):
    """Return the simple_preprocess tokens of raw_text joined by single spaces."""
    tokens = simple_preprocess(raw_text)
    return ' '.join(tokens)

dataset['questions'] = dataset['questions'].apply(_normalise_question)
ds['questions'] = ds['questions'].apply(_normalise_question)

In [8]:
# Preview the first five validation rows after text preprocessing
ds.head(n = 5)

Unnamed: 0,questions,category
0,am having different tables like select from sy...,LQ_EDIT
1,have two table m_master and tbl_appointment th...,LQ_EDIT
2,trying to extract us states from wiki url and ...,HQ
3,so new to wanna make an application that can e...,LQ_EDIT
4,basically have this array array array sub comp...,LQ_EDIT


In [9]:
# Prefixing each row of the category column with '__label__'.
# A label that already starts with the prefix is left untouched, so running
# this cell more than once does not produce '__label____label__...' values.
dataset.iloc[:, 1] = dataset.iloc[:, 1].apply(lambda x: x if x.startswith('__label__') else '__label__' + x)
ds.iloc[:, 1] = ds.iloc[:, 1].apply(lambda x: x if x.startswith('__label__') else '__label__' + x)

In [10]:
# Preview the first five validation rows after label prefixing
ds.head(n = 5)

Unnamed: 0,questions,category
0,am having different tables like select from sy...,__label__LQ_EDIT
1,have two table m_master and tbl_appointment th...,__label__LQ_EDIT
2,trying to extract us states from wiki url and ...,__label__HQ
3,so new to wanna make an application that can e...,__label__LQ_EDIT
4,basically have this array array array sub comp...,__label__LQ_EDIT


In [15]:
# Saving the training data as a text file to train the classifier:
# one example per line, written as '<category> <questions>'.
# The lines are written directly instead of through to_csv with a space as
# both separator and escape character: that combination doubles every space
# inside the text and relies on the csv module accepting one character in two
# roles, which newer Python versions may reject (NOTE(review): confirm against
# the csv documentation for the Python version in use).
with open('train.txt', 'w', encoding = 'utf-8') as train_file:
    train_file.writelines(
        f'{label} {question}\n'
        for label, question in zip(dataset['category'], dataset['questions'])
    )

In [14]:
# Saving the validation data as a text file to test the classifier:
# one example per line, written as '<category> <questions>'.
# The lines are written directly instead of through to_csv with a space as
# both separator and escape character: that combination doubles every space
# inside the text and relies on the csv module accepting one character in two
# roles, which newer Python versions may reject (NOTE(review): confirm against
# the csv documentation for the Python version in use).
with open('test.txt', 'w', encoding = 'utf-8') as test_file:
    test_file.writelines(
        f'{label} {question}\n'
        for label, question in zip(ds['category'], ds['questions'])
    )

In [19]:
# Fit a supervised fastText classifier on the labelled training file,
# with the wordNgrams option set to 2.
model = fasttext.train_supervised(
    input = 'train.txt',
    wordNgrams = 2,
)

Read 5M words
Number of words:  136076
Number of labels: 3
Progress: 100.0% words/sec/thread:  618342 lr:  0.000000 avg.loss:  0.433807 ETA:   0h 0m 0s


In [20]:
# Score the model against every example in the held-out test file,
# considering only the top prediction for each example.
model.test('test.txt', k = 1)

(15000, 0.8379333333333333, 0.8379333333333333)

In [21]:
# Classify one example: the preprocessed question in the third validation row
sample_question = ds['questions'].iloc[2]
model.predict(sample_question)

(('__label__HQ',), array([0.94950062]))

In [22]:
# Persist the trained classifier to disk
model.save_model(path = 'model.bin')