In [2]:
# Mount Google Drive at /gdrive so the dataset read below (under /gdrive/My Drive) is reachable.
from google.colab import drive
drive.mount('/gdrive')
# Change the current working directory to the mount point.
%cd /gdrive

Mounted at /gdrive
/gdrive


In [3]:
import pandas as pd
# Read the training CSV from the mounted Drive; the rendered output shows `text` and `label` columns.
data = pd.read_csv('/gdrive/My Drive/DecisionTree/archive/Train.csv')
# Bare expression: renders the DataFrame as this cell's output.
data

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [4]:
# Count the rows per label value to check how balanced the two classes are.
data['label'].value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [5]:
import spacy

# Download the small English model, then load it as the notebook-wide `nlp` pipeline
# (used by `normalize`).
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")



✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
def normalize(review, lowercase, remove_stopwords):
    """Return ``review`` as a space-joined string of spaCy lemmas.

    Args:
        review: Raw text to process.
        lowercase: When truthy, lower-case the text before it is parsed.
        remove_stopwords: When truthy, drop tokens flagged by ``token.is_stop``.

    Returns:
        The lemmas of the kept tokens, joined by single spaces.

    Relies on the notebook-level ``nlp`` pipeline having been loaded.
    """
    text = review.lower() if lowercase else review
    keep_all = not remove_stopwords
    return " ".join(
        tok.lemma_ for tok in nlp(text) if keep_all or not tok.is_stop
    )
# Store the lower-cased, stop-word-free, lemmatized text in a new `processed` column.
# NOTE(review): the train/test split below reads data['text'], so this column is not used by the visible models.
data['processed'] = data['text'].apply(normalize, lowercase=True, remove_stopwords=True)

**Preprocess**

In [6]:
# Split the data into training and testing sets (20% held out, fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=10,
)


In [7]:
# Pre-processing and bag-of-words vectorization using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokens are runs of ASCII letters and digits.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
    max_features=1000,
)
# Fit the vocabulary on the training split and encode it in one step.
X_train_vect = cv.fit_transform(X_train)
X_train_vect.shape

(32000, 1000)

In [8]:
# Densify the count matrix into a DataFrame with one column per vocabulary term, for inspection.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# `get_feature_names_out()` and fall back to the old name only on older releases.
df = pd.DataFrame(
    X_train_vect.toarray(),
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, "get_feature_names_out")
        else cv.get_feature_names()
    ),
)
df.head()



Unnamed: 0,1,10,15,2,20,3,30,4,5,50,...,wrong,wrote,yeah,year,years,yes,york,young,younger,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Encode the test reviews with the vocabulary already fitted on the training split (transform only, no refit).
X_test_vect= cv.transform(X_test)
X_test_vect.shape

(8000, 1000)

In [10]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the bag-of-words counts.
MNB = MultinomialNB()
MNB.fit(X_train_vect, Y_train)

MultinomialNB()

In [11]:
from sklearn import metrics

# Score the count-based Naive Bayes model on the held-out reviews.
predicted = MNB.predict(X_test_vect)
performance = metrics.classification_report(
    Y_test,
    predicted,
    target_names=['0', '1'],
)
print(performance)

              precision    recall  f1-score   support

           0       0.84      0.82      0.83      4010
           1       0.82      0.84      0.83      3990

    accuracy                           0.83      8000
   macro avg       0.83      0.83      0.83      8000
weighted avg       0.83      0.83      0.83      8000



In [None]:
from sklearn import svm
# Support-vector classifier with default hyperparameters, trained on the bag-of-words counts.
clf = svm.SVC()
clf.fit(X_train_vect, Y_train)

SVC()

In [17]:
from sklearn import metrics

# Score the SVC on the count-vectorized held-out reviews.
predicted = clf.predict(X_test_vect)
performance = metrics.classification_report(
    Y_test,
    predicted,
    target_names=['0', '1'],
)
print(performance)

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      4010
           1       0.85      0.88      0.86      3990

    accuracy                           0.86      8000
   macro avg       0.86      0.86      0.86      8000
weighted avg       0.86      0.86      0.86      8000



**VaderSentiment**

In [13]:
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m122.9/126.0 KB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [17]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon before building the analyzer.
nltk.downloader.download('vader_lexicon')
sentiment = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [18]:
# Turn each test review's VADER compound score into a binary label:
# strictly positive -> 1, zero or negative -> 0.
v_predicted = []
for text in X_test:
    sent = sentiment.polarity_scores(text)
    v_predicted.append(1 if sent['compound'] > 0 else 0)

In [19]:
# Compare the lexicon-based VADER labels against the true test labels.
v_performance = metrics.classification_report(
    Y_test,
    v_predicted,
    target_names=['0', '1'],
)
print(v_performance)

              precision    recall  f1-score   support

           0       0.79      0.54      0.64      4010
           1       0.65      0.85      0.74      3990

    accuracy                           0.70      8000
   macro avg       0.72      0.70      0.69      8000
weighted avg       0.72      0.70      0.69      8000



**TF-IDF**

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer

# Same tokenizer and vocabulary settings as the count-based run, with TF-IDF weighting instead of raw counts.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
    max_features=1000,
)
# Fit the vocabulary and IDF weights on the training split and encode it in one step.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape


(32000, 1000)

In [22]:
# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary term, for inspection.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# `get_feature_names_out()` and fall back to the old name only on older releases.
df = pd.DataFrame(
    X_train_tfidf.toarray(),
    columns=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, "get_feature_names_out")
        else vectorizer.get_feature_names()
    ),
)
df.head()



Unnamed: 0,1,10,15,2,20,3,30,4,5,50,...,wrong,wrote,yeah,year,years,yes,york,young,younger,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.096121,0.078394,0.0,0.128229,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.174893,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13044,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Encode the test reviews with the TF-IDF vectorizer already fitted on the training split (transform only, no refit).
X_test_tfidf= vectorizer.transform(X_test)
X_test_tfidf.shape

(8000, 1000)

In [24]:
# Repeat the Naive Bayes experiment on the TF-IDF features.
# This rebinds MNB / predicted / performance from the count-based run.
MNB = MultinomialNB()
MNB.fit(X_train_tfidf, Y_train)

predicted = MNB.predict(X_test_tfidf)
performance = metrics.classification_report(
    Y_test,
    predicted,
    target_names=['0', '1'],
)
print(performance)

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      4010
           1       0.83      0.86      0.84      3990

    accuracy                           0.84      8000
   macro avg       0.84      0.84      0.84      8000
weighted avg       0.84      0.84      0.84      8000



In [25]:
from sklearn import svm
# Fresh SVC with default hyperparameters, trained on the TF-IDF features.
# Rebinds `clf`, replacing the model fitted on the count features.
clf = svm.SVC()
clf.fit(X_train_tfidf, Y_train)

SVC()

In [26]:
from sklearn import metrics

# Score the TF-IDF SVC on the held-out reviews.
predicted = clf.predict(X_test_tfidf)
performance = metrics.classification_report(
    Y_test,
    predicted,
    target_names=['0', '1'],
)
print(performance)

              precision    recall  f1-score   support

           0       0.87      0.86      0.87      4010
           1       0.86      0.87      0.87      3990

    accuracy                           0.87      8000
   macro avg       0.87      0.87      0.87      8000
weighted avg       0.87      0.87      0.87      8000

