In [1]:
import pandas as pd
import numpy as np
import nltk
import random
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [2]:
df_train = pd.read_csv("reviews_train.csv").drop("Unnamed: 0", axis=1)
df_test = pd.read_csv("reviews_test.csv").drop("Unnamed: 0", axis=1)

# Creating Feature Set # 2

In [3]:
all_words = []

stop_words = list(set(stopwords.words('english')))
#  j is adject, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

for rev in df_train["Review"]:
    
    # remove punctuations
    cleaned = re.sub(r'[^(a-zA-Z)\s]','', rev)
    
    # tokenize 
    tokenized = word_tokenize(cleaned)
    
    # remove stopwords 
    stopped = [w for w in tokenized if not w in stop_words]
    
    # parts of speech tagging for each word 
    pos = nltk.pos_tag(stopped)
    
    # make a list of  all adjectives identified by the allowed word types list above
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())   

In [4]:
# creating a frequency distribution of each adjectives. 
BOW = nltk.FreqDist(all_words)

# listing the 5000 most frequent words
word_features = list(BOW.keys())[:5000]
word_features[0], word_features[-1]

('familiar', 'anythingbr')

In [5]:
# function to create a dictionary of features for each review in the list document.
# The keys are the words in word_features 
# The values of each key are either true or false for wether that feature appears in the review or not
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

# Creating features for each review
featuresets = []

for i, row in df_train.iterrows():
    featuresets.append((find_features(row["Review"]), row["Label"]))

# Train

In [6]:
df = pd.DataFrame.from_dict(featuresets)
df.head()

Unnamed: 0,0,1
0,"{'familiar': True, 'suspense': True, 'essentia...",1
1,"{'familiar': False, 'suspense': False, 'essent...",1
2,"{'familiar': False, 'suspense': False, 'essent...",1
3,"{'familiar': False, 'suspense': False, 'essent...",1
4,"{'familiar': False, 'suspense': False, 'essent...",1


In [7]:
df_train = pd.json_normalize(df[0])
df_train = df_train.astype(int)
df_train["Label"] = df[1]
df_train.head()

Unnamed: 0,familiar,suspense,essential,delectable,fairy,majestic,gorgeous,gown,predictable,titular,...,raucous,limo,recognisable,relate,ease,penultimate,refused,cal,anythingbr,Label
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Test

In [8]:
for i, row in df_test.iterrows():
    featuresets.append((find_features(row["Review"]), row["Label"]))
    
df2 = pd.DataFrame.from_dict(featuresets)
df2.head()

df_test = pd.json_normalize(df[0])
df_test = df_train.astype(int)

In [9]:
df_test["Label"] = df2[1]
df_test.head()

Unnamed: 0,familiar,suspense,essential,delectable,fairy,majestic,gorgeous,gown,predictable,titular,...,raucous,limo,recognisable,relate,ease,penultimate,refused,cal,anythingbr,Label
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Saving Train and Test DataFrames

In [10]:
df_train.to_csv("train2.csv")
df_test.to_csv("test2.csv")