In [1]:
from os import listdir
from os.path import isfile,join
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import hstack
import tensorflow as tf
from keras.layers import Dense
import keras
import matplotlib.pyplot as plt
from IPython.display import clear_output
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Root of the training split, relative to the current working directory.
mypath="{}/aclImdb/train/".format(os.getcwd())

In [3]:
def get_files(path:str):
    """List the regular files found in the ``pos`` and ``neg`` sub-folders of ``path``.

    Args:
        path: Directory that contains a ``pos`` and a ``neg`` sub-directory.

    Returns:
        A two-element list ``[pos_files, neg_files]``; each element is a list
        of bare file names (no directory part) in the order ``listdir``
        reports them. Sub-directories are skipped.
    """
    listings=[]
    for sentiment in ("pos","neg"):
        folder=path+"/"+sentiment+"/"
        names=[]
        for entry in listdir(folder):
            # Keep regular files only; nested directories are ignored.
            if isfile(join(folder,entry)):
                names.append(entry)
        listings.append(names)
    return listings

In [4]:
# File names of the positive and negative training reviews.
pos_files, neg_files = get_files(path=mypath)

In [5]:

def get_texts(path,pos_files,neg_files):
    """Read review files and return them as an array of ``[label, text]`` rows.

    The label is the part of the file name after the first underscore and
    before the extension (``"123_9.txt"`` -> ``"9"``); it is kept as a string.

    Args:
        path: Directory containing the ``pos`` and ``neg`` sub-directories.
        pos_files: File names to read from ``path + "/pos/"``.
        neg_files: File names to read from ``path + "/neg/"``.

    Returns:
        numpy object array with one ``[label, text]`` row per file, all
        positive files first and then all negative files, each in the order
        given. Shape is ``(n_files, 2)``, or ``(0,)`` when both lists are empty.

    Raises:
        IndexError: If a file name contains no underscore.
        OSError: If a file cannot be opened.
    """
    rows=[]
    for folder,names in ((path+"/pos/",pos_files),(path+"/neg/",neg_files)):
        for name in names:
            # Decode explicitly so the result does not depend on the platform's
            # default encoding. Assumes the review files are UTF-8 -- TODO confirm.
            # The `with` block closes the file; no manual close() is needed.
            with open(folder+name,"r",encoding="utf-8") as fin:
                text=fin.read()
            stem=os.path.splitext(name)[0]
            label=stem.split("_")[1]
            rows.append([label,text])
    # dtype=object stores references to the Python strings; the default would
    # build a fixed-width unicode array that pads every cell to the length of
    # the longest review.
    return np.array(rows,dtype=object)

In [6]:
# Load every training review as a [rating, text] row (positives first, then negatives).
texts = get_texts(path=mypath, pos_files=pos_files, neg_files=neg_files)

In [7]:
# Shuffle the rows of `texts` in place so positive and negative reviews are
# interleaved (get_texts returns all positives before all negatives).
# NOTE(review): no random seed is set in the visible cells, so the row order
# differs between runs -- consider seeding for reproducibility.
np.random.shuffle(texts)

In [None]:
# Fit a TF-IDF vocabulary of 1- to 5-grams on the review texts (column 1 of
# `texts`); min_df=0.0008 is passed as a fraction of documents.
tfidf = TfidfVectorizer(min_df=0.0008, ngram_range=(1, 5))
features = tfidf.fit_transform(texts[:, 1])

In [None]:
# Vocabulary size and feature-matrix shape. The fitted `vocabulary_` mapping
# is used for the count because `get_feature_names()` was removed in
# scikit-learn 1.2; the number printed is the same.
print(len(tfidf.vocabulary_))
print(features.shape)

In [None]:
# Turn each rating string into a one-element [0] / [1] label by rounding
# rating/10 to the nearest integer, then preview the first ten.
labels=[[round(float(rating)/10.0)] for rating,_review in texts]
print(labels[:10])

[[0], [1], [1], [1], [0], [0], [0], [1], [0], [0]]


In [None]:

def save_data(file:str,texts,features):
    """Write labels and TF-IDF features to ``file`` as one dense CSV.

    Column names come from the module-level ``tfidf`` vectorizer, so
    ``features`` must have been produced by that same fitted vectorizer.

    Args:
        file: Destination CSV path.
        texts: Sequence of ``[rating, text]`` rows; only ``row[0]`` is used.
            The label is ``round(float(rating) / 10.0)``.
        features: Sparse matrix with one row per entry of ``texts``.

    Returns:
        None. The CSV holds the DataFrame index first (``to_csv`` is called
        without ``index=False``), then ``label``, then one column per
        vocabulary term.
    """
    labels=np.array([[round(float(row[0])/10.0)] for row in texts])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    name_getter=getattr(tfidf,"get_feature_names_out",None) or tfidf.get_feature_names
    # list() because the newer API returns an ndarray, which cannot be
    # concatenated to a Python list with `+`.
    columns=['label']+list(name_getter())

    labelled=hstack((labels,features))
    # NOTE: todense() materialises the whole matrix in memory before writing.
    pd.DataFrame(labelled.todense(),columns=columns).to_csv(file)

In [None]:
# Persist the training labels and features to CSV.
save_data(file="./train_data.csv", texts=texts, features=features)

In [None]:
# Repeat the loading steps for the test split.
# NOTE: this rebinds mypath / pos_files / neg_files / texts, so the training
# values are no longer available after this cell runs.
mypath="{}/aclImdb/test/".format(os.getcwd())
pos_files, neg_files = get_files(path=mypath)
texts = get_texts(path=mypath, pos_files=pos_files, neg_files=neg_files)
np.random.shuffle(texts)

In [None]:
# Vectorise the test reviews with the vocabulary already fitted on the
# training texts (transform only, no refit), so columns match train_data.csv.
features=tfidf.transform(texts[:,1])

In [None]:
# Persist the test labels and features to CSV.
save_data(file="./test_data.csv", texts=texts, features=features)

In [None]:
# Column names of the training CSV. nrows=0 parses only the header line and
# releases the file immediately, instead of leaving an iterator reader open.
header=pd.read_csv("./train_data.csv",nrows=0).columns.tolist()

In [None]:
# Total number of CSV columns. Later cells use len(header)-2 as the feature
# count, i.e. they skip the first two columns (presumably the index column
# written by to_csv and 'label' -- verify against the CSV).
print(len(header))

In [None]:
# Chunked reader over the training CSV, 1000 rows per chunk.
reader = pd.read_csv("./train_data.csv", chunksize=1000, iterator=True)

In [None]:
# Sanity check: print the feature-matrix shape of the first chunk only
# (columns from index 2 onwards), then stop.
for chunk in reader:
    print(chunk.iloc[:,2:].shape)
    break

In [None]:
# Session setup for the TF1 graph built in the following cells.
config=tf.ConfigProto()
# Allocate GPU memory on demand rather than reserving it all up front ...
config.gpu_options.allow_growth=True
# ... and cap this process at 10% of the GPU's memory.
config.gpu_options.per_process_gpu_memory_fraction=0.1
# Discard any graph left over from an earlier run of the notebook.
tf.reset_default_graph()
# InteractiveSession installs itself as the default session on construction,
# so the former bare `sess.as_default()` call (a context manager that was
# never entered) had no effect and has been dropped.
sess=tf.InteractiveSession(config=config)

In [None]:

# Build the graph: a fully connected classifier (1024-512-256-128-64 ReLU
# layers, one sigmoid output) trained with mean squared error and plain SGD.
with tf.device('/device:GPU:0'):
    # One row per review; len(header)-2 feature columns (the first two CSV
    # columns are not features).
    X=tf.placeholder(shape=(None,len(header)-2),dtype=tf.float32)
    # Labels; the shape is left unspecified and the training cell feeds a
    # flat vector with one value per row.
    y=tf.placeholder(shape=(None),dtype=tf.float32)
    network=keras.models.Sequential()
    network.add(Dense(1024,activation=tf.nn.relu,input_shape=[len(header)-2]))
    network.add(Dense(512,activation=tf.nn.relu))
    network.add(Dense(256,activation=tf.nn.relu))
    network.add(Dense(128,activation=tf.nn.relu))
    network.add(Dense(64,activation=tf.nn.relu))
    network.add(Dense(1,activation=tf.nn.sigmoid))
    y_prime=network(X)
    # y_prime has shape (batch, 1) while the labels arrive as (batch,).
    # Subtracting them directly broadcasts to (batch, batch) and compares
    # every prediction with every label, so give the labels y_prime's shape.
    y_aligned=tf.reshape(y,tf.shape(y_prime))
    loss=tf.reduce_mean((y_aligned-y_prime)**2)
    trainer=tf.train.GradientDescentOptimizer(1e-2).minimize(loss)

In [None]:
# Chunked reader used as the mini-batch source for training (100 rows per chunk).
# It is a one-shot iterator: re-create it before re-running the training cell.
train_data = pd.read_csv("./train_data.csv", chunksize=100, iterator=True)

In [None]:

# Train on the CSV one chunk at a time, running `max_iter` optimiser steps on
# each chunk and redrawing the accumulated loss curve after every chunk.
# NOTE(review): every chunk is fitted for max_iter steps before the next one
# is seen, and the file is traversed once -- confirm this schedule is intended.
max_iter=10000
losses=[]

sess.run(tf.global_variables_initializer())
for chunk in train_data:
    # Columns from index 2 onwards are features; 'label' is the target.
    train_X=np.array(chunk.iloc[:,2:].values)
    train_y=np.array(chunk['label'].values)
    batch_feed={X:train_X,y:train_y}

    for _step in range(max_iter):
        # Run one optimiser step and keep only the loss value.
        step_loss=sess.run([trainer,loss],feed_dict=batch_feed)[1]
        losses.append(step_loss)

    # Replace the previous figure with the updated loss history.
    clear_output(wait=True)
    plt.figure(figsize=(48,20))
    plt.title("Loss")
    plt.plot(losses)
    plt.show()

In [None]:
# Chunked reader over the held-out test CSV written earlier by save_data.
# (This previously re-read train_data.csv, so the reported accuracy was
# measured on the training set.)
test_data=pd.read_csv("./test_data.csv",iterator=True,chunksize=1000)

In [None]:
# Score the network chunk by chunk and redraw the per-chunk accuracy curve.
ACCURACY=[]
for data in test_data:
    test_X=np.array(data.iloc[:,2:].values)
    test_y=np.array(data['label'].values)
    # Store predictions under their own name: rebinding `y` would overwrite
    # the label placeholder and break any later re-run of the training cell.
    predicted=np.round(sess.run(y_prime,{X:test_X}))
    # The network output is (rows, 1) but test_y is (rows,). Comparing them
    # directly broadcasts to (rows, rows), so flatten the predictions first.
    predicted=predicted.reshape(-1)
    accuracy=np.count_nonzero(np.equal(predicted,test_y))/float(len(test_y))
    ACCURACY.append(accuracy)

    clear_output(True)
    plt.figure(figsize=[48, 4])
    plt.title("Accuracy")
    plt.plot(ACCURACY)
    plt.show()

In [None]:
# Unweighted mean of the per-chunk accuracies. Raises ZeroDivisionError if
# ACCURACY is empty (e.g. the evaluation loop ran over an exhausted reader).
print(sum(ACCURACY)/len(ACCURACY))