In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np
import nltk
import re

# Download the NLTK resources this notebook relies on:
#   punkt     - tokenizer models (nltk.word_tokenize is called in Preprocessing)
#   wordnet   - data behind WordNetLemmatizer
#   omw-1.4   - presumably a WordNet companion package needed by some NLTK
#               releases; TODO confirm it is still required
#   stopwords - word lists read through stopwords.words('english')
# Packages that are already present are reported as up-to-date.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hazim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hazim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Hazim\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hazim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Both splits live in the same sibling folder; naming it once keeps the two
# paths in sync and gives a single place to edit when the data moves.
DATA_DIR = '../Assignment 1 Task 2'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [4]:
# Preview both splits (pandas elides the middle rows in the rendered output).
train_df,test_df

(                                                 review sentiment
 0     One of the other reviewers has mentioned that ...  positive
 1     A wonderful little production. <br /><br />The...  positive
 2     I thought this was a wonderful way to spend ti...  positive
 3     Basically there's a family where a little boy ...  negative
 4     Petter Mattei's "Love in the Time of Money" is...  positive
 ...                                                 ...       ...
 1995  Feeling Minnesota, directed by Steven Baigelma...  negative
 1996  THE CELL (2000) Rating: 8/10<br /><br />The Ce...  positive
 1997  This movie, despite its list of B, C, and D li...  negative
 1998  I loved this movie! It was all I could do not ...  positive
 1999  This was the worst movie I have ever seen Bill...  negative
 
 [2000 rows x 2 columns],
                                                 review sentiment
 0    Stranded in Space (1972) MST3K version - a ver...  negative
 1    - After their sons are sentenc

                                                Preprocessing Data

                                            Removing BreakLine Tokens

In [5]:
def Remove_BreakLine_Token(txt):
    """Replace HTML line-break tags in a review with a single space.

    Matches ``<br />`` as well as the equivalent spellings ``<br>`` and
    ``<br/>`` in any letter case, so no tag residue is left behind to be
    picked up as a token later on.

    Parameters
    ----------
    txt : str
        Raw review text.

    Returns
    -------
    str
        ``txt`` with every break tag replaced by one space. Each tag maps to
        its own space, so consecutive tags yield consecutive spaces.
    """
    return re.sub(r'<br\s*/?>', ' ', txt, flags=re.IGNORECASE)

In [6]:
# Strip the HTML break tags from every training review (the column is overwritten).
train_df['review'] = train_df['review'].apply(Remove_BreakLine_Token)

In [7]:
# Same cleaning for the test reviews (the column is overwritten).
test_df['review'] = test_df['review'].apply(Remove_BreakLine_Token)

                                                Lemmatizing

In [12]:
def Preprocessing(text):
    """Lemmatize every token of ``text`` and return the re-joined string.

    The text is split with ``nltk.word_tokenize``; each token is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    results are joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()
    words = nltk.word_tokenize(text)
    return " ".join(map(wnl.lemmatize, words))

In [13]:
# Lemmatize the review text of both splits; the 'review' column is overwritten.
train_df['review'] = train_df['review'].apply(Preprocessing)
test_df['review'] = test_df['review'].apply(Preprocessing)

                                                Labeling Target Vector

In [14]:
# Encoder used to turn the string sentiment labels into integer codes.
le = LabelEncoder()

In [15]:
# Learn the label -> integer mapping from the training labels only, then reuse
# that same mapping for the test labels. Re-fitting on the test split (which
# fit_transform does) can silently produce a different mapping whenever the
# set of labels in the test data differs from the training data.
train_df['sentiment'] = le.fit_transform(train_df['sentiment'])
test_df['sentiment'] = le.transform(test_df['sentiment'])

In [16]:
# Inspect the encoded labels of both splits.
train_df['sentiment'],test_df['sentiment']

(0       1
 1       1
 2       1
 3       0
 4       1
        ..
 1995    0
 1996    1
 1997    0
 1998    1
 1999    0
 Name: sentiment, Length: 2000, dtype: int64,
 0      0
 1      1
 2      1
 3      1
 4      1
       ..
 495    0
 496    1
 497    1
 498    0
 499    1
 Name: sentiment, Length: 500, dtype: int64)

In [17]:
# Separate texts and labels on the train set.
# The text column is selected by name instead of "everything but the label":
# the result is 1-D by construction and cannot silently interleave other
# columns if the frame ever carries more than review + sentiment.
X_train = train_df['review'].to_numpy()
y_train = train_df['sentiment'].values
print('Train shape: ',X_train.shape)
# English stop-word set.
# NOTE(review): not referenced by any of the visible cells - confirm it is still needed.
stop_words = set(stopwords.words('english'))

Train shape:  (2000,)


In [18]:
# Separate texts and labels on the test set.
# The text column is selected by name instead of "everything but the label":
# the result is 1-D by construction and cannot silently interleave other
# columns if the frame ever carries more than review + sentiment.
X_test = test_df['review'].to_numpy()
y_test = test_df['sentiment'].values
print("Test shape: ",X_test.shape)

Test shape:  (500,)


In [38]:
# Bag-of-words features; binary=True records only whether a term occurs in a
# review, not how many times.
vectorizer = CountVectorizer(binary=True)
# Univariate filter: keep the k=2000 terms with the highest
# mutual_info_classif score against the label.
selector = SelectKBest(mutual_info_classif, k=2000)

In [39]:
# Build a two-step pipeline:
#   1. 'Vect'     - learns the vocabulary and vectorizes the text
#   2. 'selector' - keeps only the highest-scoring features
pipeline = Pipeline([('Vect',vectorizer),('selector', selector)])

In [40]:
# Fit every step in order on the training data and return the transformed
# training matrix.
train_vector = pipeline.fit_transform(X_train,y_train)

In [41]:
# Push the test texts through the already-fitted steps; only transform() is
# called here, so nothing is re-fitted on the test data.
test_vector = pipeline.transform(X_test)

In [42]:
# Both matrices should have the same number of columns (the k selected features).
test_vector.shape,train_vector.shape

((500, 2000), (2000, 2000))

In [43]:
# Extract the vocabulary terms from the fitted vectorizer step, looked up by
# its step name.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
vect_step = pipeline.named_steps['Vect']
if hasattr(vect_step, 'get_feature_names_out'):
    # .tolist() keeps feature_name a plain list of str, as the old API returned.
    feature_name = vect_step.get_feature_names_out().tolist()
else:
    feature_name = vect_step.get_feature_names()

In [44]:
# Scores computed by the selector during fit: one per vocabulary term, not
# just for the k terms that were kept.
scores = pipeline.named_steps['selector'].scores_

In [45]:
# Sanity check: there should be exactly one score per vocabulary entry.
scores.shape,len(pipeline.named_steps['Vect'].vocabulary_)

((23284,), 23284)

In [46]:
# Pair every term with its score. The pairs are materialized as a list because
# zip() returns a one-shot iterator: once a later cell has consumed it,
# re-running that cell would silently operate on an empty sequence.
features_score = list(zip(feature_name, scores))

In [47]:
# Rank the (term, score) pairs from highest to lowest score.
Top_features = sorted(features_score,key=lambda x:x[1],reverse=True)

In [48]:
# Show only the head of the ranking; the full list holds one entry per
# vocabulary term and would flood the notebook output.
Top_features[:30]

[('bad', 0.04383129908094724),
 ('worst', 0.0352071773493042),
 ('waste', 0.024560727973535194),
 ('awful', 0.02388958384700378),
 ('excellent', 0.022970233669165688),
 ('great', 0.020933863096908145),
 ('terrible', 0.016233807555006412),
 ('boring', 0.01594761489467502),
 ('no', 0.015925364336550257),
 ('stupid', 0.015620197706580026),
 ('wonderful', 0.014135888741000253),
 ('love', 0.01376215511774137),
 ('worse', 0.012610751121030633),
 ('money', 0.01231399064673152),
 ('loved', 0.011721237536249554),
 ('best', 0.011282944289687596),
 ('even', 0.010553724268789838),
 ('perfect', 0.010506121042139459),
 ('horrible', 0.010221021198911415),
 ('wasted', 0.009743761184206962),
 ('just', 0.009232288934351518),
 ('lame', 0.00903189761382594),
 ('amazing', 0.00889370256247772),
 ('poor', 0.008814883530574497),
 ('crap', 0.008728899712759385),
 ('nothing', 0.008674598886925747),
 ('could', 0.008483308274926021),
 ('especially', 0.008398089730666164),
 ('supposed', 0.008396121552421441),
 ('b

In [49]:
# Keep only the term from each ranked (term, score) pair, preserving the order.
important_features = [term for term, _ in Top_features]

In [50]:
# Top 10 features
print('Top 10 Features: ')
# A slice instead of positional indexing: no IndexError if the ranking ever
# holds fewer than 10 entries.
for name in important_features[:10]:
    print(name)

Top 10 Features: 
bad
worst
waste
awful
excellent
great
terrible
boring
no
stupid


In [51]:
# Least 10 important features, printed from the very last entry backwards.
# A reversed tail slice replaces the manual `len - 1 - x` index arithmetic,
# which goes negative (and therefore wraps around, repeating entries) when the
# ranking holds fewer than 10 names.
# NOTE(review): the tail of the ranking probably consists of tied scores, so
# which names land here may be an artifact of sort order - confirm before
# interpreting them.
print('Least important 10 features: ')
for name in reversed(important_features[-10:]):
    print(name)

Least important 10 features: 
seen
zodiac
zinger
zhang
zemeckis
zealand
zaniness
zane
zach
yuma


In [None]:
# Persist both frames as they stand at this point (the review and sentiment
# columns were overwritten by the preprocessing and encoding cells).
# index=False keeps the row index out of the CSV files.
train_df.to_csv('preprocessed_train.csv',index=False)
test_df.to_csv('preprocessed_test.csv',index=False)