# Text classification practice
#### *Ekaterina Chunosova*

In [2]:
# Importing the libraries
import pandas as pd
import numpy as np
import glob

In [3]:
# Getting file names
train_neg_files = glob.glob("train/neg/*.txt")
train_pos_files = glob.glob("train/pos/*.txt")
test_neg_files = glob.glob("test/neg/*.txt")
test_pos_files = glob.glob("test/pos/*.txt")

In [4]:
# Reading the files and appending into the list
train_neg = []
for filename in train_neg_files:
    with open(filename, 'r') as filename:
        file = filename.read()
        train_neg.append(file)

In [5]:
# Reading the files and appending into the list
train_pos = []
for filename in train_pos_files:
    with open(filename, 'r') as filename:
        file = filename.read()
        train_pos.append(file)

In [6]:
# Creating dataframe with negative reviews
train_neg_df = pd.DataFrame({ "review": train_neg, 
                           "sentiment": 0
})
train_neg_df.head()

Unnamed: 0,review,sentiment
0,Working with one of the best Shakespeare sourc...,0
1,"Well...tremors I, the original started off in ...",0
2,Ouch! This one was a bit painful to sit throug...,0
3,"I've seen some crappy movies in my life, but t...",0
4,"""Carriers"" follows the exploits of two guys an...",0


In [7]:
# Creating dataframe with positive reviews
train_pos_df = pd.DataFrame({ "review": train_pos, 
                           "sentiment": 1
})
train_pos_df.head()

Unnamed: 0,review,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1


In [15]:
# Concatenating positive and negative reviews, checking train set
train = pd.concat([train_pos_df,train_neg_df])
train

Unnamed: 0,review,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1
...,...,...
12495,"My comments may be a bit of a spoiler, for wha...",0
12496,"The ""saucy"" misadventures of four au pairs who...",0
12497,"Oh, those Italians! Assuming that movies about...",0
12498,Eight academy nominations? It's beyond belief....,0


Now we follow the same steps for the test set:

In [8]:
# Reading the files and appending into the list
test_neg = []
for filename in test_neg_files:
    with open(filename, 'r') as filename:
        file = filename.read()
        test_neg.append(file)

In [9]:
# Reading the files and appending into the list
test_pos = []
for filename in test_pos_files:
    with open(filename, 'r') as filename:
        file = filename.read()
        test_pos.append(file)

In [10]:
# Creating dataframe with negative reviews
test_neg_df = pd.DataFrame({ "review": test_neg, 
                           "sentiment": 0
})
test_neg_df.head()

Unnamed: 0,review,sentiment
0,Alan Rickman & Emma Thompson give good perform...,0
1,I have seen this movie and I did not care for ...,0
2,"In Los Angeles, the alcoholic and lazy Hank Ch...",0
3,"This film is bundled along with ""Gli fumavano ...",0
4,I only comment on really very good films and o...,0


In [11]:
# Creating dataframe with positive reviews
test_pos_df = pd.DataFrame({ "review": test_pos, 
                           "sentiment": 1
})
test_pos_df.head()

Unnamed: 0,review,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [12]:
# Concatenating positive and negative reviews, checking test set
test = pd.concat([test_pos_df,test_neg_df])
test.head()

Unnamed: 0,review,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [16]:
# Removing <br /> tags
train["review"] = train["review"].str.replace("<br />", "")
test["review"] = test["review"].str.replace("<br />", "")
train.head(10)

Unnamed: 0,review,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1
5,I saw the movie with two grown children. Altho...,1
6,You're using the IMDb.You've given some hefty ...,1
7,This was a good film with a powerful message o...,1
8,"Made after QUARTET was, TRIO continued the qua...",1
9,"For a mature man, to admit that he shed a tear...",1


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Initialising a vectorizer. I decided not to use stop words since we are trying to conduct a sentiment analysis and some stop words may 
# actually indicate whether the review was positive or negative
vectorizer = TfidfVectorizer()
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [19]:
# Creating an TD-IDF for training
reviewVect = vectorizer.fit_transform(train['review']).toarray()

In [20]:
reviewVect

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

In [18]:
# Initialising a classifier
clf = LogisticRegression(solver='liblinear')

In [19]:
# Fitting the model
clf.fit(reviewVect, train['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Getting predictions
sentiment_pred = clf.predict(vectorizer.transform(test['review']).toarray())

In [21]:
from sklearn.metrics import confusion_matrix
#Getting accuracy, confusion matrix
print('Accuracy score, baseline, Logit: %s %%' %(100*accuracy_score(test['sentiment'],sentiment_pred)))
print('Confusion Matrix : \n' + str(confusion_matrix(test['sentiment'],sentiment_pred)))

Accuracy score, baseline, Logit: 88.288 %
Confusion Matrix : 
[[11051  1449]
 [ 1479 11021]]


In [22]:
# Concatenating test and train to optimize hyperparameters and perform cv
whole_data = pd.concat([train, test])
whole_data.shape

(50000, 2)

In [23]:
# Creating a sample of data for GridSearchCV
sample_whole_data = whole_data.sample(frac = 0.3, random_state = 42)

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
# Let's try a grid search with not so many parameters to see if it will improve the results:
grid_clf_acc = GridSearchCV(clf, 
                            param_grid = {'C':[0.1, 1, 10, 100, 1000]},
                            scoring = 'accuracy',  
                            cv = 5, 
                            n_jobs = 3) # with n_jobs = -1 my computer dies

In [26]:
# Fitting GridSearch
grid_clf_acc.fit(vectorizer.fit_transform(sample_whole_data['review']).toarray(), sample_whole_data['sentiment'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=3, param_grid={'C': [0.1, 1, 10, 100, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [35]:
# Best C parameter and score
print("Best C parameter is %s" %grid_clf_acc.best_params_)
print("Mean CV accuracy with the best parameter is %s" %grid_clf_acc.best_score_)

Best C parameter is {'C': 10}
Mean CV accuracy with the best parameter is 0.8872666666666666


As could be noticed, the improvement is only 0.5%. Let's try a bigger range of parameters just for fun.

In [47]:
# Defining new parameter grid:
param_grid = {
    'C' : np.logspace(-4, 4, 20)}

In [42]:
grid_clf_acc_ = GridSearchCV(clf, 
                            param_grid = param_grid,
                            scoring = 'accuracy',  
                            cv = 5, 
                            n_jobs = 3)

In [45]:
# Fitting GridSearch
grid_clf_acc_.fit(vectorizer.fit_transform(sample_whole_data['review']).toarray(), sample_whole_data['sentiment'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=3,
             param_grid={'C': array([1.00000...04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.0000

In [46]:
# Best C parameter and score
print("Best C parameter is %s" %grid_clf_acc_.best_params_)
print("Mean CV accuracy with the best parameter is %s" %grid_clf_acc_.best_score_)

Best C parameter is {'C': 4.281332398719396}
Mean CV accuracy with the best parameter is 0.8876


After a very considerable time of running we got an improvement of 0.04%.