In [1]:
train_path = "./resource/asnlib/public/aclImdb/train/" # use terminal to ls files under this directory
test_path = "./resource/asnlib/public/imdb_te.csv" # test data for grade evaluation

import glob
from string import punctuation
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
stop_words = []
with open("stopwords.en.txt") as file:
    for line in file:
        word = line.strip()
        stop_words.append(word)

In [3]:
def remove_stop_words(text):
    translator = str.maketrans('', '', punctuation)
    query_words = text.translate(translator).split()
    result = [word for word in query_words if word not in stop_words]
    return ' '.join(result)

In [4]:
def imdb_data_preprocess(inpath, outpath="./", name="imdb_tr.csv", mix=False):
    '''Implement this module to extract
    and combine text files under train_path directory into 
    imdb_tr.csv. Each text file in train_path should be stored 
    as a row in imdb_tr.csv. And imdb_tr.csv should have two 
    columns, "text" and label'''
    pos_text = glob.glob(inpath+"pos/*.txt")
    neg_text = glob.glob(inpath+"neg/*.txt")
    
    count = 0
    with open(outpath + name, "w", encoding="utf8") as outfile:
        for file in pos_text:
            with open(file, "r", encoding="utf8") as infile:
                text = remove_stop_words(infile.read().lower())
                outfile.write(str(count)+","+text+",1\n")
            count += 1

        for file in neg_text:
            with open(file, "r", encoding="utf8") as infile:
                text = remove_stop_words(infile.read().lower())
                outfile.write(str(count)+","+text+",0\n")
            count += 1
            
    data = pd.read_csv(outpath+name, header=None)
    return data

In [49]:
df = imdb_data_preprocess(train_path)

In [61]:
X_train = df.iloc[:, 1].as_matrix()
#X_train = np.array(df.iloc[:, 1])
y_train = df.iloc[:,2].as_matrix()
#y_train = np.array(df.iloc[:,2])
#test = pd.read_csv("./resource/asnlib/public/imdb_te.csv")


In [62]:
y_train.shape

(25000,)

In [63]:
X_train.shape  # maybe need to make X_train a list of lists?

(25000,)

In [64]:
X_train[0]

'bromwell high cartoon comedy ran time programs school life teachers 35 years teaching profession lead believe bromwell highs satire much closer reality teachers scramble survive financially insightful students see right pathetic teachers pomp pettiness whole situation remind schools knew students saw episode student repeatedly tried burn school immediately recalled high classic line inspector im sack one teachers student welcome bromwell high expect many adults age think bromwell high far fetched pity isnt'

In [104]:
vectorizer = CountVectorizer(min_df=1)

In [105]:
X_train_vec = vectorizer.fit_transform(X_train)

In [107]:
vocab = vectorizer.vocabulary_

In [108]:
len(vocab)

120611

In [109]:
X_train_vec

<25000x120611 sparse matrix of type '<class 'numpy.int64'>'
	with 2542121 stored elements in Compressed Sparse Row format>

In [110]:
clf = SGDClassifier().fit(X_train_vec, y_train)

In [111]:
test_df = pd.read_csv("./resource/asnlib/public/imdb_te.csv", encoding='ISO-8859-1')

In [112]:
X_test = test_df.iloc[:,1].as_matrix()
X_test[0]

"Oh gosh!! I love movie sooooooooooooooooooooo much!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! It incredible......I loved wee babe still love adult. It favorite Disney movie allllllllllllllllllllllllllllll time! You watch it, watch love it. My friends I watch ton.....It soooooooooooooooooooooooooooooooooo good. I recommend anyone child child heart. My favorite part song dance number strays Thomas O'Malley. The writers/producers/director completely nailed one.....yeah, nailed wall.xoxo~Wolly~xoxo"

In [113]:
X_test_vec = vectorizer.transform(X_test)

In [116]:
predict = clf.predict(X_test_vec)

In [122]:
predict[0]

1