In [1]:
import csv
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np


def buildTrainingDataSet():
    trainingData=[]
    sentiment=[]
    with open("imdb_small.csv", mode='rt') as datafile:
        data=csv.reader(datafile,delimiter=',',quotechar='\"')
        count=0
        for row in data:
            if count!=0:
                trainingData.append(row[0])
                sentiment.append(row[1])
            count+=1
    return (trainingData, sentiment)
    

class PreProcessTrainingData:
    def preProcess(self, trainingData):
        preProcess=[]
        for review in trainingData:
            preProcess.append(self.process(review))
        return preProcess
    
    def process(self, review):
        review=review.lower()
        review=re.sub('[\!]+'," exclaim",review)
        review=re.sub(r'(.)\1+', r'\1', review)
        return review

class lemmatize_tokenize:
    def __init__(self):
        self.punct=list(punctuation)
        self.stwrds= set(stopwords.words('english')+self.punct+["<br />"]+[''])
        self.porter = PorterStemmer()
    
    def __call__(self, review):
        review=word_tokenize(review)
        review=[WordNetLemmatizer().lemmatize(word) for word in review]
        review = [self.porter.stem(word) for word in review]
        return [word for word in review if word not in self.stwrds and len(word)>3]
    

def Bigram_Tokenizer(processedData, processedTestData):
    # Bigram Counts
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),tokenizer=lemmatize_tokenize())
    bigram_vectorizer.fit(processedData)
    X_train_bigram = bigram_vectorizer.transform(processedData)
    X_test_bigram = bigram_vectorizer.transform(processedTestData)
    # Bigram Tf-Idf
    bigram_tf_idf_transformer = TfidfTransformer()
    bigram_tf_idf_transformer.fit(X_train_bigram)
    X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)
    X_test_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_test_bigram)
    return X_train_bigram_tf_idf,X_test_bigram_tf_idf


def Bigram(processedData, processedTestData):
    # Bigram Counts
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    bigram_vectorizer.fit(processedData)
    X_train_bigram = bigram_vectorizer.transform(processedData)
    X_test_bigram = bigram_vectorizer.transform(processedTestData)
    # Bigram Tf-Idf
    bigram_tf_idf_transformer = TfidfTransformer()
    bigram_tf_idf_transformer.fit(X_train_bigram)
    X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)
    X_test_bigram_tf_idf= bigram_tf_idf_transformer.transform(X_test_bigram)
    return X_train_bigram_tf_idf, X_test_bigram_tf_idf

def train_test_scores(X: csr_matrix, Y: np.array,test:csr_matrix, title: str) -> None:
    clf = SGDClassifier()
    X_train, X_valid, y_train, y_valid = train_test_split(X, Y, train_size=0.50, random_state=42)
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 4)} ; Validation score: {round(valid_score, 4)}\n')
    clf.fit(X,Y)
    print(clf.predict(test))
    
trainingData, sentiment=buildTrainingDataSet()
PrePro=PreProcessTrainingData()
processedData=PrePro.preProcess(trainingData)



test=[]
with open('test.csv', mode='rt') as testfile:
    testcase=csv.reader(testfile)
    for row in testcase:
        test.append(str(row))

processedTestData=PrePro.preProcess(test)
train_tok, test_tok=Bigram_Tokenizer(processedData,processedTestData)
train_norm, test_norm=Bigram(processedData,processedTestData)
print(train_norm.shape)
print(test_norm.shape)
train_test_scores(train_tok, sentiment, test_tok, 'Bigram Tf-Idf with tokenizer')
train_test_scores(train_norm, sentiment,test_norm ,'Bigram Tf-Idf without tokenizer')



(5000, 470610)
(13, 470610)
Bigram Tf-Idf with tokenizer
Train score: 1.0 ; Validation score: 0.846

['negative' 'negative' 'negative' 'positive' 'negative' 'positive'
 'negative' 'negative' 'positive' 'positive' 'positive' 'negative'
 'negative']
Bigram Tf-Idf without tokenizer
Train score: 1.0 ; Validation score: 0.8596

['negative' 'negative' 'negative' 'positive' 'negative' 'positive'
 'negative' 'negative' 'positive' 'positive' 'negative' 'negative'
 'negative']
