In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from nltk.corpus import stopwords
import pandas as pd
import re

In [2]:
train_df = pd.read_csv('./Assignment 1 Task 2/train.csv')
test_df = pd.read_csv('./Assignment 1 Task 2/test.csv')

In [3]:
train_df,test_df

(                                                 review sentiment
 0     One of the other reviewers has mentioned that ...  positive
 1     A wonderful little production. <br /><br />The...  positive
 2     I thought this was a wonderful way to spend ti...  positive
 3     Basically there's a family where a little boy ...  negative
 4     Petter Mattei's "Love in the Time of Money" is...  positive
 ...                                                 ...       ...
 1995  Feeling Minnesota, directed by Steven Baigelma...  negative
 1996  THE CELL (2000) Rating: 8/10<br /><br />The Ce...  positive
 1997  This movie, despite its list of B, C, and D li...  negative
 1998  I loved this movie! It was all I could do not ...  positive
 1999  This was the worst movie I have ever seen Bill...  negative
 
 [2000 rows x 2 columns],
                                                 review sentiment
 0    Stranded in Space (1972) MST3K version - a ver...  negative
 1    - After their sons are sentenc

                                                Preprocessing Data

                                                Removing BreakLine Tokens

In [4]:
#Parameter 
# input: txt: string
# output: txt: string (Removed <br /> tokens)
def Remove_BreakLine_Token(txt):
    new_txt = re.sub('<br />',' ',txt)
    return new_txt

In [5]:
train_df['review'] = train_df['review'].apply(Remove_BreakLine_Token)

In [6]:
test_df['review'] = test_df['review'].apply(Remove_BreakLine_Token)

                                                    Labeling Target Vector

In [7]:
le = LabelEncoder()

In [8]:
train_df['sentiment'] = le.fit_transform(train_df['sentiment'])
test_df['sentiment'] = le.fit_transform(test_df['sentiment'])

In [9]:
train_df['sentiment'],test_df['sentiment']

(0       1
 1       1
 2       1
 3       0
 4       1
        ..
 1995    0
 1996    1
 1997    0
 1998    1
 1999    0
 Name: sentiment, Length: 2000, dtype: int32,
 0      0
 1      1
 2      1
 3      1
 4      1
       ..
 495    0
 496    1
 497    1
 498    0
 499    1
 Name: sentiment, Length: 500, dtype: int32)

In [10]:
X_train = train_df.drop('sentiment',axis=1).values.flatten()
y_train = train_df['sentiment'].values
print('Train shape: ',X_train.shape)
stop_words = set(stopwords.words('english'))

Train shape:  (2000,)


In [11]:
#Seperate texts and labels on test set
X_test = test_df.drop('sentiment',axis=1).values.flatten()
y_labels = test_df['sentiment'].values
print("Test shape: ",X_test.shape)

Test shape:  (500,)


In [12]:
pipeline = Pipeline([('Count',CountVectorizer(analyzer='word',stop_words=stop_words)),('selector', SelectKBest(mutual_info_classif, k=1000))])

In [13]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('Count',
                 CountVectorizer(stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...})),
                ('selector',
                 SelectKBest(k=1000,
                             score_func=<function mutual_info_classif at 0x0000021A40D59E50>))])

In [14]:
test_vector = pipeline.transform(X_test)

In [15]:
test_vector.shape

(500, 1000)

In [16]:
feature_name = pipeline.named_steps['Count'].get_feature_names()

In [17]:
selected_feature = pipeline.named_steps['selector'].get_support()

In [18]:
scores = pipeline.named_steps['selector'].scores_

In [19]:
scores.shape,len(pipeline.named_steps['Count'].vocabulary_)

((24992,), 24992)

In [20]:
features_score = zip(feature_name,scores)

In [21]:
Top_features = sorted(features_score,key=lambda x:x[1],reverse=True)

In [22]:
Top_features

[('bad', 0.04811816138029536),
 ('worst', 0.035865853274491516),
 ('waste', 0.02561728652160188),
 ('great', 0.024804940942942305),
 ('awful', 0.02478941032247474),
 ('excellent', 0.023609835299353583),
 ('stupid', 0.016998440671858274),
 ('best', 0.01670521791667689),
 ('love', 0.016604638223308994),
 ('terrible', 0.01633976832817613),
 ('boring', 0.016286400974158795),
 ('movie', 0.015512997397162567),
 ('wonderful', 0.014490169893036552),
 ('money', 0.013034660209581146),
 ('worse', 0.012895873954912826),
 ('even', 0.012786942249107022),
 ('loved', 0.0121349246228388),
 ('horrible', 0.011188777816465469),
 ('perfect', 0.0105663234248614),
 ('crap', 0.010085277501928034),
 ('still', 0.009976194973681326),
 ('wasted', 0.009881844473058751),
 ('minutes', 0.009879608819790865),
 ('nothing', 0.009701371849963069),
 ('poor', 0.009677737692860837),
 ('plot', 0.00956297439912669),
 ('lame', 0.009157078988191977),
 ('acting', 0.009143249615731024),
 ('amazing', 0.008984012660617765),
 ('espe

In [23]:
important_features = [feature_name for feature_name,score in Top_features]

In [24]:
important_features

['bad',
 'worst',
 'waste',
 'great',
 'awful',
 'excellent',
 'stupid',
 'best',
 'love',
 'terrible',
 'boring',
 'movie',
 'wonderful',
 'money',
 'worse',
 'even',
 'loved',
 'horrible',
 'perfect',
 'crap',
 'still',
 'wasted',
 'minutes',
 'nothing',
 'poor',
 'plot',
 'lame',
 'acting',
 'amazing',
 'especially',
 'film',
 'brilliant',
 'supposed',
 'instead',
 'performances',
 'tedious',
 'made',
 'ridiculous',
 'would',
 'world',
 'poorly',
 'young',
 'beautiful',
 'well',
 'favorite',
 'anything',
 'life',
 'gem',
 'obviously',
 'could',
 'greatest',
 'bored',
 'save',
 'always',
 'make',
 'unless',
 'also',
 'cheap',
 'decent',
 'years',
 'animated',
 'whatsoever',
 'role',
 'laughable',
 'ok',
 'script',
 'idiotic',
 'classic',
 'dull',
 'treat',
 'strong',
 'actually',
 'garbage',
 'least',
 'brilliantly',
 'terrific',
 'memorable',
 'fantastic',
 'definitely',
 'sucks',
 'annoying',
 'little',
 'hell',
 'actors',
 'shows',
 'predictable',
 'guy',
 'unique',
 'god',
 'real

In [25]:
def Bag_of_word(df,df2):
    #Seperate texts and labels on train set
    test_1 = df['review']
    labels_1 = df['sentiment']
    print('Train shape: ',test_1.shape)

    #Seperate texts and labels on test set
    test_2 = df2.drop('sentiment',axis=1).values.flatten()
    labels_2 = df2['sentiment'].values
    print("Test shape: ",test_2.shape)

    #Initializing stop words
    stop_words = set(stopwords.words('english'))

    #Initialize TfidfVectorizer object
    vectorizer = CountVectorizer(analyzer='word',stop_words=stop_words)

    # Fit the data (This wil tokenize and build a vocabulary)
    vector_1 = vectorizer.fit_transform(test_1)
    print(vector_1)

    #Fit the data on the test set
    vector_2 = vectorizer.transform(test_2)

    # return both of the vectors
    return vector_1,labels_1,vector_2,labels_2,vectorizer



In [26]:
train_vectors,train_labels,test_vectors,test_labels,vectorizer = Bag_of_word(train_df,test_df)

Train shape:  (2000,)
Test shape:  (500,)
  (0, 15537)	1
  (0, 18564)	1
  (0, 14061)	1
  (0, 24214)	2
  (0, 15870)	6
  (0, 7561)	2
  (0, 10670)	1
  (0, 18669)	2
  (0, 7770)	1
  (0, 10070)	1
  (0, 8464)	2
  (0, 22356)	1
  (0, 21396)	2
  (0, 3075)	1
  (0, 23338)	1
  (0, 19341)	1
  (0, 23924)	4
  (0, 19745)	1
  (0, 24664)	2
  (0, 9466)	1
  (0, 22989)	1
  (0, 20021)	3
  (0, 8059)	1
  (0, 10267)	1
  (0, 22515)	1
  :	:
  (1999, 16805)	1
  (1999, 5072)	1
  (1999, 8162)	1
  (1999, 24750)	1
  (1999, 2514)	1
  (1999, 3144)	1
  (1999, 20022)	1
  (1999, 4388)	1
  (1999, 13188)	1
  (1999, 3118)	2
  (1999, 23078)	1
  (1999, 23260)	1
  (1999, 19516)	4
  (1999, 17891)	1
  (1999, 21892)	1
  (1999, 2403)	3
  (1999, 9542)	1
  (1999, 17899)	1
  (1999, 9880)	1
  (1999, 4491)	1
  (1999, 20334)	1
  (1999, 24915)	1
  (1999, 15070)	1
  (1999, 11628)	1
  (1999, 4385)	2


In [27]:
train_dataFrame = pd.DataFrame(train_vectors.todense(),columns=vectorizer.get_feature_names())

In [28]:
train_dataFrame

Unnamed: 0,00,000,007,00am,01pm,02,04,06,07,08,...,zwick,zzzzzzzzzzzzzzzzzz,álvaro,ángel,æon,élan,être,ís,ísnt,île
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
vectorizer.vocabulary_

{'one': 15537,
 'reviewers': 18564,
 'mentioned': 14061,
 'watching': 24214,
 'oz': 15870,
 'episode': 7561,
 'hooked': 10670,
 'right': 18669,
 'exactly': 7770,
 'happened': 10070,
 'first': 8464,
 'thing': 22356,
 'struck': 21396,
 'brutality': 3075,
 'unflinching': 23338,
 'scenes': 19341,
 'violence': 23924,
 'set': 19745,
 'word': 24664,
 'go': 9466,
 'trust': 22989,
 'show': 20021,
 'faint': 8059,
 'hearted': 10267,
 'timid': 22515,
 'pulls': 17463,
 'punches': 17478,
 'regards': 18103,
 'drugs': 6899,
 'sex': 19776,
 'hardcore': 10092,
 'classic': 4189,
 'use': 23625,
 'called': 3340,
 'nickname': 15106,
 'given': 9385,
 'oswald': 15695,
 'maximum': 13852,
 'security': 19560,
 'state': 21107,
 'penitentary': 16285,
 'focuses': 8657,
 'mainly': 13497,
 'emerald': 7323,
 'city': 4147,
 'experimental': 7894,
 'section': 19554,
 'prison': 17193,
 'cells': 3717,
 'glass': 9407,
 'fronts': 8958,
 'face': 8020,
 'inwards': 11722,
 'privacy': 17199,
 'high': 10455,
 'agenda': 697,
 'em'

                                                Feature Importance

In [30]:
def Feature_importance(X,y):
    selector = SelectKBest(mutual_info_classif,k='all')
    x_reduced = selector.fit_transform(X,y)
    print(x_reduced.shape)    
    cols = selector.get_support(indices=True)
    selected_columns = X.iloc[:,cols].columns.tolist()
    print(selected_columns)
    


In [31]:
train_vectors

<2000x24992 sparse matrix of type '<class 'numpy.int64'>'
	with 191958 stored elements in Compressed Sparse Row format>

In [32]:
Feature_importance(train_vectors,train_labels)