In [1]:
from sklearn.datasets import load_files
import re 
import pandas as pd
import numpy as np
pattern = r'https?://\S+|www\.\S+|[.,;/!\n\-@#$%^&*()_+=\[\]{}|\\:"<>?`~\']'



In [2]:
train = load_files("./data/aclImdb/train", encoding="utf8")
x_train, y_train = train.data, train.target

test = load_files("./data/aclImdb/test")
x_test, y_test = test.data, test.target

train.target_names

['neg', 'pos']

#### Problem 1

In [3]:
sentences = {
    "sent1": "This movie is SOOOO funny!!!", 
    "sent2": "What a movie! I never", 
    "sent3": "best movie ever!!!!! this movie"
}


def replace_and_retain_newlines(match):
    return '\n' if match.group(0) == '\n' else ''

#proccessing text
def processText(sentences):
    word = "\n".join(sentences)
    sentences = re.sub(pattern, replace_and_retain_newlines, word).lower().strip().split("\n")
    return sentences



#Bag of words algorithm
def bagOfWords(words, dimiliter=" ", max_features=None):
    #join sentences without losing position
    sentence = "\n".join(words)
    word = " ".join(words)

    # get keys 
    keys = set(word.split(dimiliter)) 

    # maximum features to be selected
    if max_features is None:
        max_features = len(keys)     
    keys = list(keys)[:max_features]
    # stores words in each sentence
    temp = pd.Series(sentence.split("\n")).apply(str.split)
    frame = pd.DataFrame(np.zeros((len(words), len(keys))), columns=keys)

    #O(t) = max_features
    for key in keys:  
        frame[key] = temp.apply(lambda x: x.count(key))

    return frame

#n-gram algorithm
def ngram(words, n):
    rows = len(words)
    result = []
    for i in range(rows):#loops through each sentence in list
        test = words[i].split()
        word = ""
        loop = len(test) -n + 1
        for j in range(loop):# joins the first n words in sentence
            word += " ".join(test[j:j+n]) + " "
            # if j == loop -1:
            #     continue
            word += ", "
        
        result.append(word)

    
    return result




In [4]:
values = processText(list(sentences.values()))
values

['this movie is soooo funny',
 'what a movie i never',
 'best movie ever this movie']

In [36]:
h = bagOfWords(values)
h

Unnamed: 0,is,movie,soooo,i,never,ever,what,this,best,a,funny
0,1,1,1,0,0,0,0,1,0,0,1
1,0,1,0,1,1,0,1,0,0,1,0
2,0,2,0,0,0,1,0,1,1,0,0


In [12]:
np.sum(h.apply(lambda x: x/np.sum(x), axis=1), axis=1)

0    1.0
1    1.0
2    1.0
dtype: float64

In [46]:

bi = ngram(values, 2)

bagOfWords(bi, ", ")


Unnamed: 0,Unnamed: 1,a movie,movie i,what a,movie ever,soooo funny,movie is,this movie,is soooo,i never,best movie,ever this
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0


#### Problem 2

In [47]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression


#### Problem 3

In [52]:
vect = CountVectorizer(stop_words="english", max_features=5000)
transformer = TfidfTransformer(norm=None)
b_train = transformer.fit_transform(vect.fit_transform(x_train).toarray())
b_test = transformer.fit_transform(vect.fit_transform(x_test).toarray())


In [216]:
model = LogisticRegression(penalty="l2", max_iter=500).fit(b_train, y_train)
model.score(b_test, y_test)

#### Problem 4: Scratch mounting of TF-IDF 

##### Standard implementation

In [83]:
# Standard implementation of TF-IDF

# Term frequency = number of repititions of words in sentence /number of words in sentence

# Inverse document frequency= log{number of sentences/number of sentences containing word}

class TfIdfVectorizer:
    def __init__(self) -> None:
        pass

    def fit_transform(self, words):
        self.__bagOfWords(words)
        numofsent = self.frame.shape[0] # number of sentences
        tf = self.frame.apply(lambda x: x/np.sum(x), axis=1) # frequency of words in sentence / total words in sentence
        idf = np.log(numofsent / self.frame.apply(lambda x: np.sum(x!=0), axis=0)) # log {number of sentences / number of sentences containing word}

        return tf*idf
        
    

    def __bagOfWords(self, words, dimiliter=" ", max_features=None):
        #join sentences without losing position
        sentence = "\n".join(words)
        word = " ".join(words)

        # get keys 
        keys = set(word.split(dimiliter)) 

        # maximum features to be selected
        if max_features is None:
            max_features = len(keys)     
        self.keys = list(keys)[:max_features]
        # stores words in each sentence
        temp = pd.Series(sentence.split("\n")).apply(str.split)
        frame = pd.DataFrame(np.zeros((len(words), len(self.keys))), columns=self.keys)

        #O(t) = max_features
        for key in self.keys:  
            frame[key] = temp.apply(lambda x: x.count(key))

        self.frame = frame

In [41]:
values

['this movie is soooo funny',
 'what a movie i never',
 'best movie ever this movie']

In [84]:
TfIdfVectorizer().fit_transform(values)

Unnamed: 0,is,movie,soooo,i,never,ever,what,this,best,a,funny
0,0.219722,0.0,0.219722,0.0,0.0,0.0,0.0,0.081093,0.0,0.0,0.219722
1,0.0,0.0,0.0,0.219722,0.219722,0.0,0.219722,0.0,0.0,0.219722,0.0
2,0.0,0.0,0.0,0.0,0.0,0.219722,0.0,0.081093,0.219722,0.0,0.0


##### Scikit learn implementation

In [90]:
# Scikit learn implementation of TF-IDF

# Term frequency = frequency of words 

# Inverse document frequency= log{1 + number of sentences/ 1 + number of sentences containing word}+1

class TfIdfVectorizer2:
    def __init__(self) -> None:
        pass

    def fit_transform(self, words):
        self.__bagOfWords(words)
        numofsent = self.frame.shape[0] + 1# number of sentences
        tf = self.frame # frequency of words
        idf = np.log(numofsent / self.frame.apply(lambda x: np.sum(x!=0)+1, axis=0)) + 1 # log {number of sentences / number of sentences containing word}

        return tf*idf
        
    

    def __bagOfWords(self, words, dimiliter=" ", max_features=None):
        #join sentences without losing position
        sentence = "\n".join(words)
        word = " ".join(words)

        # get keys 
        keys = set(word.split(dimiliter)) 

        # maximum features to be selected
        if max_features is None:
            max_features = len(keys)     
        self.keys = list(keys)[:max_features]
        # stores words in each sentence
        temp = pd.Series(sentence.split("\n")).apply(str.split)
        frame = pd.DataFrame(np.zeros((len(words), len(self.keys))), columns=self.keys)

        #O(t) = max_features
        for key in self.keys:  
            frame[key] = temp.apply(lambda x: x.count(key))

        self.frame = frame


In [91]:
TfIdfVectorizer2().fit_transform(values)

Unnamed: 0,is,movie,soooo,i,never,ever,what,this,best,a,funny
0,1.693147,1.0,1.693147,0.0,0.0,0.0,0.0,1.287682,0.0,0.0,1.693147
1,0.0,1.0,0.0,1.693147,1.693147,0.0,1.693147,0.0,0.0,1.693147,0.0
2,0.0,2.0,0.0,0.0,0.0,1.693147,0.0,1.287682,1.693147,0.0,0.0
