# Text Classification Full Implementation

![nlp](https://wrm5sysfkg-flywheel.netdna-ssl.com/wp-content/uploads/2019/01/NLP-Technology-in-Healthcare.jpg)

In [1]:
import numpy as np
import pandas as pd

## Load the data

In [2]:
# Read the tab-separated movie-review file into a DataFrame.
df = pd.read_csv(
    './resources/moviereviews.tsv',
    delimiter='\t',  # alias of `sep`; the file is tab-separated, not comma-separated
)

In [3]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


## Handle the null rows & clean data

In [4]:
# Count the missing values in each column.
df.isna().sum()

label      0
review    35
dtype: int64

In [5]:
# Drop every row that has a missing value in any column; this mutates df in place.
df.dropna(inplace=True)

In [6]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

label     0
review    0
dtype: int64

In [7]:
# Some reviews are not NaN but consist solely of whitespace ("faux blanks"),
# so dropna() does not catch them. Collect the index labels of those rows so
# they can be dropped afterwards.
#
# The vectorised .str.isspace() test replaces a row-by-row itertuples() loop
# and does not depend on df having exactly two columns. It expects the
# 'review' column to contain no nulls at this point.
whitespace_only = df['review'].str.isspace()

# Index labels (not positions) of the whitespace-only reviews, as a plain list.
blanks = df.index[whitespace_only.to_numpy()].tolist()


In [9]:
len(blanks)

27

In [10]:
# Remove, in place, the rows whose index labels were collected in `blanks`.
df.drop(index=blanks, inplace=True)

In [11]:
len(df)

1938

## Split into training & testing data sets

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Features are the raw review texts; targets are the corresponding labels.
x, y = df['review'], df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

## Build a pipeline to vectorize the data

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC 

In [15]:
# Two-stage pipeline: raw review text -> TF-IDF features -> linear SVM classifier.
# LinearSVC is given a fixed random_state so that refitting on a fresh kernel
# produces the same model, consistent with the fixed seed already used for the
# train/test split. (The seed only matters when the solver shuffles the data,
# i.e. for the dual formulation.)
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

## Fit the model

In [16]:
# Learn the TF-IDF vocabulary and train the classifier on the training split only.
text_clf.fit(X=x_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

## Make predictions 

In [17]:
# Predict a label for every held-out review using the fitted pipeline.
preds = text_clf.predict(X=x_test)

## Metrics

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [19]:
# Rows are the true labels and columns the predicted labels, in sorted label order.
print(confusion_matrix(y_true=y_test, y_pred=preds))

[[235  47]
 [ 41 259]]


In [20]:
# Per-class precision, recall and F1; print() is needed because the report is a multi-line string.
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

    accuracy                           0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582



In [21]:
# Fraction of held-out reviews whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=preds))

0.8487972508591065
