<a href="https://colab.research.google.com/github/konrad1001/sentimentally/blob/master/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reading the CSV file
First step is to read our csv file into a pandas dataframe.

In [98]:
import pandas as pd
import numpy as np


# 20000x6000 start: 15:47 end 15:58

df = pd.read_csv("/content/drive/MyDrive/computers/IMDB Dataset.csv")

In [99]:
# set constants

NUMBER_OF_DOCUMENTS = 40000
NUMBER_OF_WORDS = 8000


#Basic Cleaning

Stripping non alphabet characters, converting to lowercase, and removing the html tags left over.

In [100]:
import re

df = df[:NUMBER_OF_DOCUMENTS]

df['review'] = df['review'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))   #remove non-alphabets
df['review'] = df['review'].apply(lambda x: x.lower())  #convert to lowercase
df['review'] = df['review'].apply(lambda x: re.sub(r'br', '', x))  #remove pesky leftover <br> tags

print(df)

                                                  review sentiment
0      one of the other reviewers has mentioned that ...  positive
1      a wonderful little production          the fil...  positive
2      i thought this was a wonderful way to spend ti...  positive
3      basically there s a family where a little boy ...  negative
4      petter mattei s  love in the time of money  is...  positive
...                                                  ...       ...
39995  this was a marvelously funny comedy with a gre...  positive
39996  there is no plot  there are no central charact...  positive
39997  this show is awesome  i love all the actors  i...  positive
39998  the fact that this movie has been entitled to ...  negative
39999  i have to confess that i am severely disappoin...  negative

[40000 rows x 2 columns]


#Import dictionary

In [101]:
from os import terminal_size
#import dictionary

import numpy as np

class Dictionary:
    FILE_PATH = "/content/drive/MyDrive/computers/dictionary.txt"
    def __init__(self, length) -> None:
      self.length = length
      self.dictionary = np.array([])
      self.document_frequency_array = np.zeros(length)
      self.length = length
      self.load(self.FILE_PATH)

    def load(self, filename: str) -> None:
      self.dictionary = np.loadtxt(filename, dtype=str)[:self.length]

    def get(self, index: int) -> str:
      return self.dictionary[index]

    def get_index(self, word: str) -> int:
        try:
            return np.where(self.dictionary == word)[0][0]
        except IndexError:
            return -1

    def get_size(self) -> int:
      return self.length

    def encode(self, vector):
      encoded_vector = np.zeros(self.length)
      term_frequency = np.zeros(self.length)
      for index in vector:
        if index != -1:
          encoded_vector[index] += 1
          term_frequency[index] = 1

      self.document_frequency_array += term_frequency

      return encoded_vector

    def get_document_frequency(self):
      return self.document_frequency_array


#Convert to a Tokenised vector
While we're at it lets keep a track of how long each review is, how many words make it up. This will be useful for calculating the term frequnecy

In [102]:

d = Dictionary(length=NUMBER_OF_WORDS) #change

df['review'] = df['review'].apply(lambda x: x.split())
df['review'] = df['review'].apply(lambda x: [d.get_index(word) for word in x]) #remove words not in dictionary and tokenize
df['review'] = df['review'].apply(lambda x: [val for val in x if val != -1])
df['length'] = df['review'].apply(lambda x: len(x))
#df = df[df['review'] != -1]

print(df)


                                                  review sentiment  length
0      [44, 1, 0, 45, 39, 2783, 9, 149, 3494, 108, 26...  positive     271
1      [4, 2766, 397, 883, 0, 3844, 7, 173, 173, 370,...  positive     142
2      [13, 990, 11, 29, 4, 2766, 202, 3, 2918, 49, 8...  positive     144
3      [5270, 62, 89, 4, 261, 158, 4, 397, 1392, 6148...  negative     112
4      [89, 369, 5, 0, 49, 1, 390, 7, 4, 7024, 676, 3...  positive     202
...                                                  ...       ...     ...
39995  [11, 29, 4, 2346, 3312, 12, 4, 208, 2393, 371,...  positive     115
39996  [62, 7, 47, 3869, 62, 19, 47, 685, 2085, 62, 1...  positive     241
39997  [11, 267, 7, 4490, 13, 369, 24, 0, 5360, 15, 3...  positive     127
39998  [0, 853, 9, 11, 489, 39, 84, 3962, 3, 0, 126, ...  negative     217
39999  [13, 25, 3, 9, 13, 83, 11, 291, 33, 5, 47, 202...  negative     222

[40000 rows x 3 columns]


#Encode
We can now convert our token vectors into a bag of words.

In [103]:

df['review'] = df['review'].apply(lambda x: d.encode(x))


#Compute Term Frequency, Inverse Document Frequency

In [104]:
def TF(vector):
  total_number_of_terms = np.sum(vector)
  return vector / total_number_of_terms

def IDF(vector):
  N_documents_in_corpus = np.full(NUMBER_OF_WORDS, NUMBER_OF_DOCUMENTS)
  N_documents_containing_term = d.get_document_frequency() + 1

  return np.log(N_documents_in_corpus / N_documents_containing_term)

def TF_IDF(vector):
  return TF(vector) * IDF(vector)

computed_TFIDF = pd.DataFrame(df['sentiment'])
computed_TFIDF['review as TFIDF'] = df['review'].apply(lambda x: TF_IDF(x))

positive = computed_TFIDF[df['sentiment']=='positive']
negative = computed_TFIDF[df['sentiment']=='negative']


#Output to NPZ files
Output to the numpy format that will let us quickly access the values later.

In [105]:
DIMENSIONS = str(NUMBER_OF_DOCUMENTS) + "x" + str(NUMBER_OF_WORDS)
#computed_TFIDF.to_csv('/content/drive/MyDrive/computers/TFIDF' + DIMENSIONS + '.csv', index=False)

np.savez('/content/drive/MyDrive/computers/Sentimentally/positiveTFIDF' + DIMENSIONS + '.npz', *positive['review as TFIDF'].tolist())
np.savez('/content/drive/MyDrive/computers/Sentimentally/negativeTFIDF' + DIMENSIONS + '.npz', *negative['review as TFIDF'].tolist())

