In [1]:
import pandas as pd
import numpy as np
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import os

In [2]:
maindir = os.getcwd()
maindir

'/home/jupyter-jmabel/401_Project2'

In [3]:
os.chdir("/../../../datasets/aclImdb/train")
os.listdir()

['pos',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt',
 'neg',
 'labeledBow.feat',
 'unsupBow.feat',
 'unsup']

In [4]:
os.chdir("pos")
pos_files = os.listdir()
pos_reviews = []
for i in pos_files:
    f = open(i,'r')
    rev = f.read()
    pos_reviews.append(rev)
    f.close()

In [5]:
os.chdir("../neg")
neg_files = os.listdir()
neg_reviews = []
for i in neg_files:
    f = open(i,'r')
    rev = f.read()
    neg_reviews.append(rev)
    f.close()

In [6]:
pos_df = pd.DataFrame({"File":pos_files,"Review":pos_reviews,"Label":[1]*len(pos_files)})
neg_df = pd.DataFrame({"File":neg_files,"Review":neg_reviews,"Label":[-1]*len(neg_files)})

Train_df = pd.concat([pos_df,neg_df])
Train_df.head()

Unnamed: 0,File,Review,Label
0,2893_10.txt,Walt Disney's CINDERELLA takes a story everybo...,1
1,7944_9.txt,"Have you ever, or do you have, a pet who's bee...",1
2,11725_10.txt,"I suck at gratuitous Boob references, so i'm j...",1
3,1587_10.txt,"Does anyone know, where I can see or download ...",1
4,10297_8.txt,Well not actually. This movie is very entertai...,1


In [7]:
import nltk 
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/jupyter-
[nltk_data]     jmabel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter-jmabel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
all_words = []

stop_words = list(set(stopwords.words('english')))
#  j is adject, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

for rev in pos_df["Review"]:
    
    # remove punctuations
    cleaned = re.sub(r'[^(a-zA-Z)\s]','', rev)
    
    # tokenize 
    tokenized = word_tokenize(cleaned)
    
    # remove stopwords 
    stopped = [w for w in tokenized if not w in stop_words]
    
    # parts of speech tagging for each word 
    pos = nltk.pos_tag(stopped)
    
    # make a list of  all adjectives identified by the allowed word types list above
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())   
            
for rev in neg_df["Review"]:
    
    # remove punctuations
    cleaned = re.sub(r'[^(a-zA-Z)\s]','', rev)
    
    # tokenize 
    tokenized = word_tokenize(cleaned)
    
    # remove stopwords 
    stopped = [w for w in tokenized if not w in stop_words]
    
    # parts of speech tagging for each word 
    pos = nltk.pos_tag(stopped)
    
    # make a list of  all adjectives identified by the allowed word types list above
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())   

In [9]:
# creating a frequency distribution of each adjectives. 
BOW = nltk.FreqDist(all_words)

# listing the 5000 most frequent words
word_features = list(BOW.keys())[:5000]
word_features[0], word_features[-1]

('familiar', 'anythingbr')

In [10]:
# function to create a dictionary of features for each review in the list document.
# The keys are the words in word_features 
# The values of each key are either true or false for wether that feature appears in the review or not
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

# Creating features for each review
featuresets = []

for i, row in Train_df.iterrows():
    featuresets.append((find_features(row["Review"]), row["Label"]))

In [11]:
df = pd.DataFrame.from_dict(featuresets)
df.head()

Unnamed: 0,0,1
0,"{'familiar': True, 'suspense': True, 'essentia...",1
1,"{'familiar': False, 'suspense': False, 'essent...",1
2,"{'familiar': False, 'suspense': False, 'essent...",1
3,"{'familiar': False, 'suspense': False, 'essent...",1
4,"{'familiar': False, 'suspense': False, 'essent...",1


In [12]:
df_train = pd.json_normalize(df[0])
df_train = df_train.astype(int)
df_train["Label"] = df[1]
df_train.head()

Unnamed: 0,familiar,suspense,essential,delectable,fairy,majestic,gorgeous,gown,predictable,titular,...,raucous,limo,recognisable,relate,ease,penultimate,refused,cal,anythingbr,Label
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
os.chdir(maindir)
df_train.to_csv("Training_data2.csv")

In [14]:
os.chdir("/../../../datasets/aclImdb/test")

In [15]:
os.chdir("pos")
pos_files = os.listdir()
pos_reviews = []
for i in pos_files:
    f = open(i,'r')
    rev = f.read()
    pos_reviews.append(rev)
    f.close()

In [16]:
os.chdir("../neg")
neg_files = os.listdir()
neg_reviews = []
for i in neg_files:
    f = open(i,'r')
    rev = f.read()
    neg_reviews.append(rev)
    f.close()

In [17]:
pos_df = pd.DataFrame({"File":pos_files,"Review":pos_reviews,"Label":[1]*len(pos_files)})
neg_df = pd.DataFrame({"File":neg_files,"Review":neg_reviews,"Label":[-1]*len(neg_files)})

Test_df = pd.concat([pos_df,neg_df])
Test_df.head()

Unnamed: 0,File,Review,Label
0,2893_10.txt,"""Rush in Rio"" is, no doubt, one of the most ex...",1
1,8705_10.txt,I have seen a number of horror movies to know ...,1
2,11725_10.txt,I'm a fan of B grade 80s films in which the he...,1
3,9859_8.txt,"I think that Pierre Léaud, or his character, t...",1
4,12409_10.txt,This picture doesn't have any big explosions o...,1


In [18]:
for i, row in Test_df.iterrows():
    featuresets.append((find_features(row["Review"]), row["Label"]))
    
df2 = pd.DataFrame.from_dict(featuresets)
df2.head()

df_test = pd.json_normalize(df[0])
df_test = df_train.astype(int)

KeyError: 1

In [19]:
df_test["Label"] = df2[1]
df_test.head()

Unnamed: 0,familiar,suspense,essential,delectable,fairy,majestic,gorgeous,gown,predictable,titular,...,raucous,limo,recognisable,relate,ease,penultimate,refused,cal,anythingbr,Label
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [20]:
os.chdir(maindir)
df_test.to_csv("Testing_data2.csv")