In [None]:
# !pip install -q kaggle

In [None]:
# !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [2]:
# Imports

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk

import re

import gc

In [None]:
tweets = pd.read_csv('IMDB Dataset.csv', encoding='latin-1')

In [None]:
tweets.head()

In [None]:
# Target label encoding

le = LabelEncoder()
tweets['sentiment_encoded'] = le.fit_transform(tweets['sentiment'])
tweets.head(3)

### Text preprocessing

In [None]:
# base cleaning function: remove mentions, hashtags, digits, etc

def clean(text):
    text = text.lower()
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'@\w+',' ',text)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'<.*?>',' ', text)
    text = re.sub(r'<[^>]+>', '', text)
    
    punctuation = ['""', '&', '...', '(', ')', ':', ',', '.', '!', "'", '/']
    for p in punctuation:
        text = text.replace(p, "")
        
    text = text.replace('  ', " ")
    text = text.replace('   ', " ")
    return text

In [None]:
text = tweets['review']
text.head()

In [None]:
# Apply base cleaning functions

text = text.apply(clean)
text.head(15)

In [None]:
# Stopwords: non-informative words
# Lemmatizer: returns base or dictionary word form
# Tokenizer: splitting text into tokens (words)
import nltk
# nltk.download('wordnet')
from nltk.corpus import stopwords

# from nltk.stem import 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [None]:
# uncomment
# stopwords.words()

Stopwords removing

In [None]:
# create lemmatizer object
wn_lemmatizer = WordNetLemmatizer()

In [None]:
dirty_tweet = text.iloc[520]

In [None]:
print(f'''
      ** Original Tweet **: \n \n      
      {dirty_tweet}
      
      ------------------------------------------------
      
      ** WordNetLemmatizer: ** \n \n
      {' '.join([wn_lemmatizer.lemmatize(word) for word in dirty_tweet.split()])}
      ''')

In [None]:
# Apply lemmatizer to tweets :
# 
lemmatized_text = []
for tweet in text:
    lemmatized_text.append(' '.join([wn_lemmatizer.lemmatize(word) for word in tweet.split()]))

In [None]:
# Create and apply tokenizer

reg_tokenizer = RegexpTokenizer('\w+')

In [None]:
tokenized_text = reg_tokenizer.tokenize_sents(lemmatized_text)
# tokenized_text[:2]

In [None]:
len(tokenized_text)

In [None]:
# tokenized_text

In [None]:
# cache stopwords in variable
# delete stopwords from tweets
# nltk.download('stopwords')
sw = stopwords.words()

In [None]:
clean_tokenized_tweets = [] 
for i, element in enumerate(tokenized_text):
    if i % 2000 == 0: print(i, end = ' ')
    clean_tokenized_tweets.append(' '.join([word for word in element if word not in sw]))

In [None]:
df = pd.concat([pd.Series(clean_tokenized_tweets, name='tweet'), 
                pd.Series(tweets['sentiment_encoded'], name='sentiment')], 
               axis=1)

In [None]:
df.to_csv('clean_tweets', index=False)

In [4]:
df = pd.read_csv('clean_tweets')
df

Unnamed: 0,tweet,sentiment
0,reviewer mentioned watching oz episode youll h...,1
1,wonderful little production filming technique ...,1
2,thought wa wonderful way spend time hot summer...,1
3,basically family little boy jake think zombie ...,0
4,petter matteis love time money visually stunni...,1
...,...,...
49995,thought movie right good job wasnt creative or...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary school ta...,0
49998,going disagree previous comment side maltin se...,0


`CountVectorizer`: make **bag of words** representation 

`TfidfVectorizer`: make **frequency based** representation

In [None]:
gc.collect()

In [None]:
# Create objects
cvec = CountVectorizer(ngram_range=(1, 2))
tfid = TfidfVectorizer(ngram_range=(1, 2))

In [8]:
cvec_representation = cvec.fit_transform(df['tweet'])
tfid_representation = tfid.fit_transform(df['tweet'])

In [13]:
cvec_representation.shape

(50000, 3041562)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
x_train, x_test, y_train, y_test = train_test_split(cvec_representation, 
                                                    df['sentiment'], test_size=0.2, random_state=900)

In [11]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((40000, 3041562), (10000, 3041562), (40000,), (10000,))

In [23]:
x_train_tfid, x_test_tfid, y_train_tfid, y_test_tfid = train_test_split(tfid_representation, 
                                                    df['sentiment'], test_size=0.2, random_state=900)

In [24]:
x_train_tfid.shape, x_test_tfid.shape, y_train_tfid.shape, y_test_tfid.shape

((40000, 3041562), (10000, 3041562), (40000,), (10000,))

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(cvec_representation, 
                                                    df['sentiment'], test_size=0.2, random_state=900)
x_train.shape, x_test.shape, y_train.shape, y_test.shape
x_train_tfid, x_test_tfid, y_train_tfid, y_test_tfid = train_test_split(tfid_representation, 
                                                    df['sentiment'], test_size=0.2, random_state=900)

x_train_tfid.shape, x_test_tfid.shape, y_train_tfid.shape, y_test_tfid.shape

### RandomForest cvec_representation

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [29]:
rf = RandomForestClassifier(max_depth=10, random_state=0)
rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=10, random_state=0)

In [30]:
y_pred = rf.predict(x_test)

In [45]:
print(f'RandomForestClassifier accuracy cvec:{accuracy_score(y_pred, y_test)}')

RandomForestClassifier accuracy cvec:0.8017


### RandomForest tfid_representation

In [42]:
rf_tfid = RandomForestClassifier()
rf_tfid.fit(x_train_tfid, y_train_tfid)

RandomForestClassifier()

In [43]:
y_pred_tfid = rf.predict(x_test_tfid)

In [44]:
print(f'RandomForestClassifier accuracy tfid:{accuracy_score(y_pred_tfid, y_test_tfid)}')

RandomForestClassifier accuracy tfid:0.507


In [None]:
rf_tfid

In [None]:
# import classifiers

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [None]:
# mnb = MultinomialNB()
# lrc = LogisticRegression()
# svc = SVC()

In [None]:
# clfs = {
#     'MultiNB' : mnb, 
#     'LogisticRegression' : lrc,
# #     'SVM' : svc
# }

In [None]:
# def get_scores(clfs, n_cv=2):
#     for clf in clfs: 
#         cvec_scores = cross_val_score(clfs[clf], cvec_representation, tweets['sentiment_encoded'], n_jobs=4, cv=n_cv, scoring='accuracy')
#         tfidf_scores = cross_val_score(clfs[clf], tfid_representation,  tweets['sentiment_encoded'], n_jobs=4, cv=n_cv, scoring='accuracy')
        
#         cvec_mean_score, cvec_std_score = np.mean(cvec_scores), np.std(cvec_scores)
#         tfidf_mean_score, tfidf_std_score = np.mean(tfidf_scores), np.std(tfidf_scores)
#         print(f''' 
#         {clf}
#         CountVectorizer score: {cvec_mean_score:.3f}, std: {cvec_std_score:.3f}
#         TfIdf score: {tfidf_mean_score:.3f}, std: {tfidf_std_score:.3f}
#         ''')
        

In [None]:
# get_scores(clfs, 2)