In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from collections import Counter
from sklearn.dummy import DummyClassifier

In [2]:
# train data 
IMDB_train = pd.read_csv('IMDB-train copy.txt',sep = '\t',header = None)
yelp_train = pd.read_csv('yelp-train copy.txt',sep = '\t',header = None)

# validation data
IMDB_valid = pd.read_csv('IMDB-valid copy.txt',sep = '\t',header = None)
yelp_valid = pd.read_csv('yelp-valid copy.txt',sep = '\t',header = None)

# test data
IMDB_test = pd.read_csv('IMDB-test copy.txt',sep = '\t',header = None)
yelp_test = pd.read_csv('yelp-test copy.txt',sep = '\t',header = None)

# vocab
IMDB_vocab = pd.read_csv('IMDB-vocab copy.txt',sep = '\t',header = None)
yelp_vocab = pd.read_csv('yelp-vocab copy.txt',sep = '\t',header = None)

In [3]:
# preprocessing into bag of words and frequency of words 
def preprocessing_bow(data): 
    binary = np.zeros((len(data),10000))
    for i in range(len(data[0])):
        review = data.iloc[i,0].split(' ')
        for j in range(len(review)): 
            try:
                review[j]=int(review[j])
                binary[i][review[j]]=1
            except: 
                continue
    return binary

def preprocessing_fbow(data):
    frequency = np.zeros((len(data),10000))
    for i in range(len(data[0])):
        review = data.iloc[i,0].split(' ')
        for j in range(len(review)):
            try:
                review[j]=int(review[j])
                frequency[i][review[j]] += 1/len(review)
            except:
                continue
    return frequency      

In [4]:
IMDBt=preprocessing_bow(IMDB_train)
yelpt=preprocessing_bow(yelp_train)
IMDBv=preprocessing_bow(IMDB_valid)
yelpv=preprocessing_bow(yelp_valid)
IMDBtest=preprocessing_bow(IMDB_test)
yelptest=preprocessing_bow(yelp_test)

In [5]:
IMDBt_fbow=preprocessing_fbow(IMDB_train)
yelpt_fbow=preprocessing_fbow(yelp_train)
IMDBv_fbow=preprocessing_fbow(IMDB_valid)
yelpv_fbow=preprocessing_fbow(yelp_valid)
IMDBtest_fbow=preprocessing_fbow(IMDB_test)
yelptest_fbow=preprocessing_fbow(yelp_test)

In [6]:
#IMDBt_fbow=preprocessing_fbow(IMDB_train)
#print(IMDBt_fbow[0])
#print(sum(IMDBt_fbow[0]))

In [7]:
# majority class, random class 
def majclass(x,y,test): 
    clf = DummyClassifier(strategy='most_frequent')
    clf.fit(x,y)
    return clf.predict(test)

def randomclass(x,y,test):
    clf = DummyClassifier(strategy='uniform')
    clf.fit(x,y)
    return clf.predict(test)

In [8]:
# naive bayes
def naivebayes(x,y,a,test):
    clf = BernoulliNB(alpha = a)
    clf.fit(x,y)
    return clf.predict(test)

def naivegaus(x,y,test):
    clf = GaussianNB()
    clf.fit(x, y)
    return clf.predict(test)

In [9]:
###### HYPERPARAMETER TUNING for NAIVE BAYES
#parameters = np.linspace(0,20, num=20)
parameters = [0, 0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 12, 20, 50]
#def findf1_score(IMDB_train,IMDBv,IMDB_valid,IMDBt,parameters): 
def testingnb(IMDB_valid, IMDBt, IMDB_train, parameters, IMDBv):
    maxindex=0
    maxf1=0
    for i in range(len(parameters)): 
        x=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], parameters[i], IMDBv),average='micro')
    #x1=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='micro')
    #x2=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='macro')
    #x3=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='samples')
        print('parameter:', parameters[i])
        print('x',x)
        if (x>maxf1): 
            maxf1=x
            maxindex=parameters[i]
    return maxf1, maxindex

In [10]:
# linear SVM
def linear(x,y,a,test):
    clf = LinearSVC(C=a, random_state=0, max_iter=100000)
    clf.fit(x, y)
    return clf.predict(test)
    

In [11]:
#linear SVM for IMDB
# linear SVM
def linearIMDB(x,y,a,test):
    clf = LinearSVC(C=a, random_state=0, dual=False, max_iter=100000)
    clf.fit(x, y)
    return clf.predict(test)
    

In [12]:
###### HYPERPARAMETER TUNING FOR LINEAR SVM
#parameters = np.linspace(0,20, num=20)
parameters = [0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 20,100, 1000]
#def findf1_score(IMDB_train,IMDBv,IMDB_valid,IMDBt,parameters): 
def testinglinearIMDB(IMDB_valid, IMDBt, IMDB_train, parameters, IMDBv):
    maxindex=0
    maxf1=0
    for i in range(len(parameters)): 
        x=f1_score(IMDB_valid[1], linearIMDB(IMDBt, IMDB_train[1], parameters[i], IMDBv),average='micro')
    #x1=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='micro')
    #x2=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='macro')
    #x3=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='samples')
        print('linear parameter:', parameters[i])
        print('x',x)
        if (x>maxf1): 
            maxf1=x
            maxindex=parameters[i]
    return maxf1, maxindex

In [13]:
###### HYPERPARAMETER TUNING FOR LINEAR SVM
#parameters = np.linspace(0,20, num=20)
parameters = [0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 20,100, 1000]
#def findf1_score(IMDB_train,IMDBv,IMDB_valid,IMDBt,parameters): 
def testinglinear(IMDB_valid, IMDBt, IMDB_train, parameters, IMDBv):
    maxindex=0
    maxf1=0
    for i in range(len(parameters)): 
        x=f1_score(IMDB_valid[1], linear(IMDBt, IMDB_train[1], parameters[i], IMDBv),average='micro')
    #x1=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='micro')
    #x2=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='macro')
    #x3=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='samples')
        print('linear parameter:', parameters[i])
        print('x',x)
        if (x>maxf1): 
            maxf1=x
            maxindex=parameters[i]
    return maxf1, maxindex

In [14]:
# decision tree 
def decisiontree(x,y,test):
    clf = DecisionTreeClassifier()
    clf.fit(x, y)
    return clf.predict(test)

In [15]:
 # decision tree WITH MAX DEPTH
def decisiontree_maxd(x,y,a,test):
    clf = DecisionTreeClassifier(max_depth=a)
    clf.fit(x, y)
    return clf.predict(test)

In [16]:
#### testing max-depth
maxdepths = np.linspace(1,32,32,endpoint=True)
def testingdt(v, xt, x_train, maxdepths, xv):
    maxindex=0
    maxf1=0
    for i in maxdepths: 
        x=f1_score(v[1], decisiontree_maxd(xt, x_train[1], i, xv), average='micro')
        print('parameter:', i)
        print('x',x)
        if (x>maxf1): 
            maxf1=x
            maxindex=i
    return maxf1, maxindex

In [17]:
# decision tree WITH MIN SAMPLES SPLIT
def decisiontree1(x,y,b,test):
    clf = DecisionTreeClassifier( min_samples_split=b,random_state=None)
    clf.fit(x, y)
    return clf.predict(test)

In [18]:
#### testing min_samples_split WITH MIN SAMPLES SPLIT
min_samples_split = np.linspace(0.1, 1.0, 10, endpoint=True)
def testingdtmss(IMDB_valid, IMDBt, IMDB_train, min_samples_split, IMDBv):
    maxindex=0
    maxf1=0
    for i in min_samples_split: 
        x=f1_score(IMDB_valid[1], decisiontree1(IMDBt, IMDB_train[1], i, IMDBv), average='weighted')
        print('min_sample:', i)
        print('x',x)
        if (x>maxf1): 
            maxf1=x
            maxindex=i
    return maxf1, maxindex

#a=maxp_dt
#print(a)

In [19]:
# decision tree with min samples leaf 
def decisiontree2(x,y,c,test):
    clf = DecisionTreeClassifier(min_samples_leaf=c,random_state=None)
    clf.fit(x, y)
    return clf.predict(test)

In [20]:
#### testing min_sample_leaf
min_samples_leaf = np.linspace(0.1, 0.5, 5, endpoint=True)
def testingdtmsl(IMDB_valid, IMDBt, IMDB_train, min_samples_leaf, IMDBv):
    maxindex=0
    maxf1=0
    for i in min_samples_leaf: 
        x=f1_score(IMDB_valid[1], decisiontree2(IMDBt, IMDB_train[1], i, IMDBv), average='weighted')
        print('min_sample:', i)
        print('x',x)
        if (x>maxf1): 
            maxf1=x
            maxindex=i
    return maxf1, maxindex
#a=maxp_dt
#b=maxpmss_dt
#print(a)
#print(b)

Question 2(a)

In [21]:
# F1_score for random and majority classifiers for yelp

f1_yelp= f1_score(yelp_test[1], randomclass(yelpt, yelp_train[1], yelptest), average='micro')
print("F1_measure for random classifier for yelp is:", f1_yelp)

f1_majority_yelp=f1_score(yelp_test[1], majclass(yelpt, yelp_train[1], yelptest), average='micro')
print("F1_measure for majority classifier for yelp is:", f1_majority_yelp)


F1_measure for random classifier for yelp is: 0.19
F1_measure for majority classifier for yelp is: 0.351


In [25]:
##### PARAMETER TUNING FOR YELP_BBOW, NAIVE BAYES, DECISION TREES, LINEAR SVM
parameters = [0, 0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 12, 20, 50]
maxf1_nb_bow, maxp_nb_bow = testingnb(yelp_valid, yelpt, yelp_train, parameters, yelpv)
print("Max F1 for Naive Bayes:",maxf1_nb_bow)
print("seen at parameter:", maxp_nb_bow)

maxdepths = np.linspace(1,32,32,endpoint=True)
maxf1_dt_bow, maxp_dt_bow = testingdt(yelp_valid, yelpt, yelp_train, maxdepths, yelpv)
maxf1_yelporg = f1_score(yelp_valid[1],decisiontree(yelpt, yelp_train[1], yelpv), average='micro')
print("Original F1 score for decision tree without parameter tuning:", maxf1_yelporg)
print("Max F1 for DecisionTree:",maxf1_dt_bow)
print("seen at parameter:", maxp_dt_bow)

parameters_lin = [0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 20,100, 1000]
maxf1_linear_bow, maxp_linear_bow = testinglinear(yelp_valid, yelpt, yelp_train, parameters_lin, yelpv)
print("Max F1 for Linear:",maxf1_linear_bow)
print("seen at parameter:", maxp_linear_bow)

  'setting alpha = %.1e' % _ALPHA_MIN)


parameter: 0
x 0.38
parameter: 0.0001
x 0.404
parameter: 0.001
x 0.42299999999999993
parameter: 0.01
x 0.426
parameter: 0.1
x 0.411
parameter: 1
x 0.384
parameter: 2
x 0.38499999999999995
parameter: 10
x 0.377
parameter: 12
x 0.378
parameter: 20
x 0.377
parameter: 50
x 0.38499999999999995
Max F1 for Naive Bayes: 0.426
seen at parameter: 0.01
parameter: 1.0
x 0.38499999999999995
parameter: 2.0
x 0.376
parameter: 3.0
x 0.376
parameter: 4.0
x 0.376
parameter: 5.0
x 0.378
parameter: 6.0
x 0.388
parameter: 7.0
x 0.393
parameter: 8.0
x 0.404
parameter: 9.0
x 0.391
parameter: 10.0
x 0.39
parameter: 11.0
x 0.372
parameter: 12.0
x 0.374
parameter: 13.0
x 0.374
parameter: 14.0
x 0.35800000000000004
parameter: 15.0
x 0.361
parameter: 16.0
x 0.352
parameter: 17.0
x 0.363
parameter: 18.0
x 0.375
parameter: 19.0
x 0.363
parameter: 20.0
x 0.36
parameter: 21.0
x 0.362
parameter: 22.0
x 0.36
parameter: 23.0
x 0.368
parameter: 24.0
x 0.35500000000000004
parameter: 25.0
x 0.353
parameter: 26.0
x 0.364999

In [26]:
####### Training, validation and test F1-measure for BBOW
nb_f1_train = f1_score(yelp_train[1], naivebayes(yelpt, yelp_train[1], maxp_nb_bow, yelpt),average='micro')
gb_f1_train = f1_score(yelp_train[1], naivegaus(yelpt, yelp_train[1], yelpt), average="micro")


nb_f1_valid = f1_score(yelp_valid[1], naivebayes(yelpt, yelp_train[1], maxp_nb_bow, yelpv),average='micro')
gb_f1_valid = f1_score(yelp_valid[1], naivegaus(yelpt, yelp_train[1], yelpv), average="micro")

nb_f1_test = f1_score(yelp_test[1], naivebayes(yelpt, yelp_train[1], maxp_nb_bow, yelptest),average='micro')
gb_f1_test = f1_score(yelp_test[1], naivegaus(yelpt, yelp_train[1], yelptest), average="micro")


dt_f1_train=f1_score(yelp_train[1], decisiontree_maxd(yelpt, yelp_train[1], maxp_dt_bow, yelpt), average='micro')
dt_f1_valid=f1_score(yelp_valid[1], decisiontree_maxd(yelpt, yelp_train[1], maxp_dt_bow, yelpv), average='micro')
dt_f1_test=f1_score(yelp_test[1], decisiontree_maxd(yelpt, yelp_train[1], maxp_dt_bow, yelptest), average='micro')

svm_f1_train=f1_score(yelp_train[1], linear(yelpt, yelp_train[1], maxp_linear_bow, yelpt),average='micro')
svm_f1_valid=f1_score(yelp_valid[1], linear(yelpt, yelp_train[1], maxp_linear_bow, yelpv),average='micro')
svm_f1_test=f1_score(yelp_test[1], linear(yelpt, yelp_train[1], maxp_linear_bow, yelptest),average='micro')

print("F1 FOR YELP BBOW")
print("Max F1 for Naive Bayes for training:", nb_f1_train)
print("seen at parameter:", maxp_nb_bow)
#print("Max F1 for Gaussian Bayes for training:", gb_f1_train)

print("Max F1 for Naive Bayes for validation:", nb_f1_valid)
print("seen at parameter:", maxp_nb_bow)
#print("Max F1 for Gaussian Bayes for validation:", gb_f1_valid)

print("Max F1 for Naive Bayes for testing:", nb_f1_test)
print("seen at parameter:", maxp_nb_bow)
#print("Max F1 for Gaussian Bayes for testing:", gb_f1_test)

print("Max F1 for Decision Tree for training:", dt_f1_train)
print("seen at parameter:",maxp_dt_bow)
print("Max F1 for Decision Tree for validation:", dt_f1_valid)
print("seen at parameter:",maxp_dt_bow)
print("Max F1 for Decision Tree for testing:", dt_f1_test)
print("seen at parameter:",maxp_dt_bow)

print("Max F1 for SVM for training:", svm_f1_train)
print("seen at parameter:",maxp_linear_bow)
print("Max F1 for SVM for validation:", svm_f1_valid)
print("seen at parameter:",maxp_linear_bow)
print("Max F1 for SVM for testing:", svm_f1_test)
print("seen at parameter:",maxp_linear_bow)

F1 FOR YELP BBOW
Max F1 for Naive Bayes for training: 0.7458571428571429
seen at parameter: 0.01
Max F1 for Naive Bayes for validation: 0.426
seen at parameter: 0.01
Max F1 for Naive Bayes for testing: 0.443
seen at parameter: 0.01
Max F1 for Decision Tree for training: 0.48014285714285715
seen at parameter: 8.0
Max F1 for Decision Tree for validation: 0.403
seen at parameter: 8.0
Max F1 for Decision Tree for testing: 0.3995
seen at parameter: 8.0
Max F1 for SVM for training: 0.8418571428571429
seen at parameter: 0.01
Max F1 for SVM for validation: 0.499
seen at parameter: 0.01
Max F1 for SVM for testing: 0.5075
seen at parameter: 0.01


In [27]:
##### PARAMETER TUNING FOR YELP_FBOW, NAIVE BAYES, DECISION TREES, LINEAR SVM
parameters = [0, 0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 12, 20, 50]
maxf1_nb_fbow, maxp_nb_fbow = testingnb(yelp_valid, yelpt_fbow, yelp_train, parameters, yelpv_fbow)
print("Max F1 for Naive Bayes:",maxf1_nb_fbow)
print("seen at parameter:", maxp_nb_fbow)

maxf1_dt_fbow, maxp_dt_fbow = testingdt(yelp_valid, yelpt_fbow, yelp_train, maxdepths, yelpv_fbow)
print("Max F1 for DecisionTree:",maxf1_dt_fbow)
print("seen at parameter:", maxp_dt_fbow)

parameters_lin = [0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 20, 100, 1000]
maxf1_linear_fbow, maxp_linear_fbow = testinglinear(yelp_valid, yelpt_fbow, yelp_train, parameters_lin, yelpv_fbow)
print("Max F1 for Linear:",maxf1_linear_fbow)
print("seen at parameter:", maxp_linear_fbow)

  'setting alpha = %.1e' % _ALPHA_MIN)


parameter: 0
x 0.38
parameter: 0.0001
x 0.404
parameter: 0.001
x 0.42299999999999993
parameter: 0.01
x 0.426
parameter: 0.1
x 0.411
parameter: 1
x 0.384
parameter: 2
x 0.38499999999999995
parameter: 10
x 0.377
parameter: 12
x 0.378
parameter: 20
x 0.377
parameter: 50
x 0.38499999999999995
Max F1 for Naive Bayes: 0.426
seen at parameter: 0.01
parameter: 1.0
x 0.383
parameter: 2.0
x 0.384
parameter: 3.0
x 0.383
parameter: 4.0
x 0.388
parameter: 5.0
x 0.39200000000000007
parameter: 6.0
x 0.40700000000000003
parameter: 7.0
x 0.41
parameter: 8.0
x 0.425
parameter: 9.0
x 0.425
parameter: 10.0
x 0.42999999999999994
parameter: 11.0
x 0.40599999999999997
parameter: 12.0
x 0.40599999999999997
parameter: 13.0
x 0.395
parameter: 14.0
x 0.368
parameter: 15.0
x 0.353
parameter: 16.0
x 0.35500000000000004
parameter: 17.0
x 0.372
parameter: 18.0
x 0.364
parameter: 19.0
x 0.349
parameter: 20.0
x 0.347
parameter: 21.0
x 0.339
parameter: 22.0
x 0.362
parameter: 23.0
x 0.359
parameter: 24.0
x 0.354
parame

In [None]:
Question 3(c)

In [28]:
###### F1 FOR YELP FBOW
####### Training, validation and test F1-measure 
# Naive bayes F1 using best parameter = 0.1
parameter=maxp_nb_fbow

nb_f1_train_fbow = f1_score(yelp_train[1], naivebayes(yelpt_fbow, yelp_train[1], maxp_nb_fbow, yelpt_fbow),average='micro')
gb_f1_train_fbow = f1_score(yelp_train[1], naivegaus(yelpt_fbow, yelp_train[1], yelpt_fbow), average="micro")


nb_f1_valid_fbow = f1_score(yelp_valid[1], naivebayes(yelpt_fbow, yelp_train[1], maxp_nb_fbow, yelpv_fbow),average='micro')
gb_f1_valid_fbow = f1_score(yelp_valid[1], naivegaus(yelpt_fbow, yelp_train[1], yelpv_fbow), average="micro")

nb_f1_test_fbow = f1_score(yelp_test[1], naivebayes(yelpt_fbow, yelp_train[1], maxp_nb_fbow, yelptest_fbow),average='micro')
gb_f1_test_fbow = f1_score(yelp_test[1], naivegaus(yelpt_fbow, yelp_train[1], yelptest_fbow), average="micro")


dt_f1_train_fbow=f1_score(yelp_train[1], decisiontree_maxd(yelpt_fbow, yelp_train[1], maxp_dt_fbow, yelpt_fbow), average='micro')
dt_f1_valid_fbow=f1_score(yelp_valid[1], decisiontree_maxd(yelpt_fbow, yelp_train[1], maxp_dt_fbow, yelpv_fbow), average='micro')
dt_f1_test_fbow=f1_score(yelp_test[1], decisiontree_maxd(yelpt_fbow, yelp_train[1], maxp_dt_fbow, yelptest_fbow), average='micro')

svm_f1_train_fbow=f1_score(yelp_train[1], linear(yelpt_fbow, yelp_train[1], maxp_linear_fbow, yelpt_fbow),average='micro')
svm_f1_valid_fbow=f1_score(yelp_valid[1], linear(yelpt_fbow, yelp_train[1], maxp_linear_fbow, yelpv_fbow),average='micro')
svm_f1_test_fbow=f1_score(yelp_test[1], linear(yelpt_fbow, yelp_train[1], maxp_linear_fbow, yelptest_fbow),average='micro')

#print("Max F1 for Naive Bayes for training:", nb_f1_train_fbow)
#print("seen at parameter:", maxp_nb_fbow)
print("Max F1 for Gaussian Bayes for training:", gb_f1_train_fbow)
#print("Max F1 for Naive Bayes for validation:", nb_f1_valid_fbow)
#print("seen at parameter:", maxp_nb_fbow)
print("Max F1 for Gaussian Bayes for validation:", gb_f1_valid_fbow)
#print("Max F1 for Naive Bayes for testing:", nb_f1_test_fbow)
#print("seen at parameter:", maxp_nb_fbow)
print("Max F1 for Gaussian Bayes for testing:", gb_f1_test_fbow)

print("Max F1 for Decision Tree for training:", dt_f1_train_fbow)
print("seen at parameter:",maxp_dt_fbow)
print("Max F1 for Decision Tree for validation:", dt_f1_valid_fbow)
print("seen at parameter:",maxp_dt_fbow)
print("Max F1 for Decision Tree for testing:", dt_f1_test_fbow)
print("seen at parameter:",maxp_dt_fbow)

print("Max F1 for SVM for training:", svm_f1_train_fbow)
print("seen at parameter:",maxp_linear_fbow)
print("Max F1 for SVM for validation:", svm_f1_valid_fbow)
print("seen at parameter:",maxp_linear_fbow)
print("Max F1 for SVM for testing:", svm_f1_test_fbow)
print("seen at parameter:",maxp_linear_fbow)

Max F1 for Gaussian Bayes for training: 0.8045714285714286
Max F1 for Gaussian Bayes for validation: 0.295
Max F1 for Gaussian Bayes for testing: 0.31
Max F1 for Decision Tree for training: 0.5428571428571428
seen at parameter: 10.0
Max F1 for Decision Tree for validation: 0.424
seen at parameter: 10.0
Max F1 for Decision Tree for testing: 0.401
seen at parameter: 10.0
Max F1 for SVM for training: 0.6414285714285715
seen at parameter: 10
Max F1 for SVM for validation: 0.501
seen at parameter: 10
Max F1 for SVM for testing: 0.508
seen at parameter: 10


Question 4(a)

In [29]:
# F1 score for IMDB using random classifier 

f1_IMDB= f1_score(IMDB_test[1], randomclass(IMDBt, IMDB_train[1], IMDBtest), average='micro')
print("F1_measure for random classifier for IMDB is:", f1_IMDB)


F1_measure for random classifier for IMDB is: 0.49648


In [None]:
Question 4(b) 

In [30]:
# Parameter testing for linear SVM IMDB

parameters = [0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 20,100, 1000]
maxf1_linear1, maxp_linear1 = testinglinearIMDB(IMDB_valid, IMDBt, IMDB_train, parameters, IMDBv)
print("Max F1 for Linear:",maxf1_linear1)
print("seen at parameter:", maxp_linear1)


linear parameter: 0.0001
x 0.8317
linear parameter: 0.001
x 0.8675
linear parameter: 0.01
x 0.8746
linear parameter: 0.1
x 0.8568
linear parameter: 1
x 0.8444
linear parameter: 2
x 0.8424000000000001
linear parameter: 10
x 0.8406
linear parameter: 20
x 0.8408
linear parameter: 100
x 0.8396999999999999
linear parameter: 1000
x 0.8408
Max F1 for Linear: 0.8746
seen at parameter: 0.01


In [31]:
# testing validation f1_score for Naive Bayes IMDB 

parameters = [0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 20,100, 1000]
maxf1_nb1, maxp_nb1 = testingnb(IMDB_valid, IMDBt, IMDB_train, parameters, IMDBv)
print("Max F1 for Naive Bayes:",maxf1_nb1)
print("seen at parameter:", maxp_nb1)


parameter: 0.0001
x 0.8424000000000001
parameter: 0.001
x 0.8427
parameter: 0.01
x 0.8431000000000001
parameter: 0.1
x 0.8436
parameter: 1
x 0.8424000000000001
parameter: 2
x 0.8416999999999999
parameter: 10
x 0.8378
parameter: 20
x 0.8359
parameter: 100
x 0.8115
parameter: 1000
x 0.7209
Max F1 for Naive Bayes: 0.8436
seen at parameter: 0.1


original decision tree (no parameter tuning)

In [32]:
# F1 Score for original decision tree with no hyperparameter tuning 

maxf1_original = f1_score(IMDB_valid[1],decisiontree(IMDBt, IMDB_train[1], IMDBv), average='micro')
print(maxf1_original)

0.6978


Decision Tree (tuning max depth from 1-32)

In [33]:
# testing validation f1_score for decision tree with max depth 
maxdepths = np.linspace(1,32,32,endpoint=True)
maxf1_dt1, maxp_dt1 = testingdt(IMDB_valid, IMDBt, IMDB_train, maxdepths, IMDBv)
print("Max F1 for DecisionTree:",maxf1_dt1)
print("seen at parameter:", maxp_dt1)

parameter: 1.0
x 0.6113
parameter: 2.0
x 0.6453
parameter: 3.0
x 0.6664
parameter: 4.0
x 0.6689
parameter: 5.0
x 0.6815
parameter: 6.0
x 0.6957
parameter: 7.0
x 0.7053
parameter: 8.0
x 0.7045
parameter: 9.0
x 0.7046
parameter: 10.0
x 0.7106999999999999
parameter: 11.0
x 0.712
parameter: 12.0
x 0.7205999999999999
parameter: 13.0
x 0.7207
parameter: 14.0
x 0.7242
parameter: 15.0
x 0.7239
parameter: 16.0
x 0.7217000000000001


KeyboardInterrupt: 

Decision Tree (Min Sample Split tuning from 0.1-1.0)

In [34]:
# testing validation f1_score for decision tree with min sample split
min_samples_split = np.linspace(0.1, 1.0, 10, endpoint=True)
maxf1mss_dt, maxpmss_dt = testingdtmss(IMDB_valid, IMDBt, IMDB_train, min_samples_split, IMDBv)
print("Max F1 for Decision Tree:",maxf1mss_dt)
print("seen at parameter:", maxpmss_dt)

min_sample: 0.1
x 0.721465416415963
min_sample: 0.2
x 0.6980133308372874
min_sample: 0.30000000000000004
x 0.6980133308372874
min_sample: 0.4
x 0.6980133308372874
min_sample: 0.5
x 0.6691405945531125
min_sample: 0.6
x 0.653336891326543
min_sample: 0.7000000000000001
x 0.653336891326543
min_sample: 0.8
x 0.580417154295444
min_sample: 0.9
x 0.580417154295444
min_sample: 1.0
x 0.580417154295444
Max F1 for Decision Tree: 0.721465416415963
seen at parameter: 0.1


Decision Tree (Min Sample Leaf tuning from 0.1 - 0.5)

In [35]:
# Parameter tuning for decision tree with min sample leaf
min_samples_leaf = np.linspace(0.1, 0.5, 5, endpoint=True)
maxf1msl_dt, maxpmsl_dt = testingdtmsl(IMDB_valid, IMDBt, IMDB_train,min_samples_leaf, IMDBv)
print("Max F1 for Decision Tree:",maxf1msl_dt)
print("seen at parameter:", maxpmsl_dt)


min_sample: 0.1
x 0.6492301383028154
min_sample: 0.2
x 0.6303876794396931
min_sample: 0.30000000000000004
x 0.558060448739638
min_sample: 0.4
x 0.5524427902976884
min_sample: 0.5
x 0.3333333333333333
Max F1 for Decision Tree: 0.6492301383028154
seen at parameter: 0.1


  'precision', 'predicted', average, warn_for)


for i in enumerate(parameters): 
    x=f1_score(IMDB_valid[1], naivegaus(IMDBt, IMDB_train[1], i, IMDBv),average='weighted')
    #x1=f1_score(IMDB_valid[1], naivegaus(IMDBt, IMDB_train[1], i, IMDBv),average='micro')
    #x2=f1_score(IMDB_valid[1], naivegaus(IMDBt, IMDB_train[1], i, IMDBv),average='macro')
    #x3=f1_score(IMDB_valid[1], naivegaus(IMDBt, IMDB_train[1], i, IMDBv),average='samples')
    print('ng1',x)
    #print('ng2',x1)
    #print('ng3', x2)
    #print('ng4',x3)

# SAMPLES 
for i in range(len(parameters)): 
    #x=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='weighted')
    #x1=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='micro')
    #x2=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], i, IMDBv),average='macro')
    x3=f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], parameters[i], IMDBv),average='samples')
    #print('x',x)
    #print('x1',x1)
    #print('x2', x2)
    print("parameters",parameters[i])
    print('x3',x3)

Question 4(d)

In [37]:
#maxp_dt1=14
####### Training, validation and test F1-measure 
# IMDB BBOW
# Naive bayes F1 using best parameter = 0.1
nb_f1_train = f1_score(IMDB_train[1], naivebayes(IMDBt, IMDB_train[1], maxp_nb1, IMDBt),average='micro')
gb_f1_train = f1_score(IMDB_train[1], naivegaus(IMDBt, IMDB_train[1], IMDBt), average="micro")


nb_f1_valid = f1_score(IMDB_valid[1], naivebayes(IMDBt, IMDB_train[1], maxp_nb1, IMDBv),average='micro')
gb_f1_valid = f1_score(IMDB_valid[1], naivegaus(IMDBt, IMDB_train[1], IMDBv), average="micro")

nb_f1_test = f1_score(IMDB_test[1], naivebayes(IMDBt, IMDB_train[1], maxp_nb1, IMDBtest),average='micro')
gb_f1_test = f1_score(IMDB_test[1], naivegaus(IMDBt, IMDB_train[1], IMDBtest), average="micro")


dt_f1_train=f1_score(IMDB_train[1], decisiontree_maxd(IMDBt, IMDB_train[1],  maxp_dt1, IMDBt), average='micro')
dt_f1_valid=f1_score(IMDB_valid[1], decisiontree_maxd(IMDBt, IMDB_train[1],  maxp_dt1, IMDBv), average='micro')
dt_f1_test=f1_score(IMDB_test[1], decisiontree_maxd(IMDBt, IMDB_train[1],  maxp_dt1, IMDBtest), average='micro')

svm_f1_train=f1_score(IMDB_train[1], linearIMDB(IMDBt, IMDB_train[1], maxp_linear1, IMDBt),average='micro')
svm_f1_valid=f1_score(IMDB_valid[1], linearIMDB(IMDBt, IMDB_train[1], maxp_linear1, IMDBv),average='micro')
svm_f1_test=f1_score(IMDB_test[1], linearIMDB(IMDBt, IMDB_train[1], maxp_linear1, IMDBtest),average='micro')

print("Max F1 for Naive Bayes for training:", nb_f1_train)
print("seen at parameter:", maxp_nb1)
#print("Max F1 for Gaussian Bayes for training:", gb_f1_train)
print("Max F1 for Naive Bayes for validation:", nb_f1_valid)
print("seen at parameter:", maxp_nb1)
#print("Max F1 for Gaussian Bayes for validation:", gb_f1_valid)
print("Max F1 for Naive Bayes for testing:", nb_f1_test)
print("seen at parameter:", maxp_nb1)
#print("Max F1 for Gaussian Bayes for testing:", gb_f1_test)

print("Max F1 for Decision Tree for training:", dt_f1_train)
print("seen at parameter:",maxp_dt1)
print("Max F1 for Decision Tree for validation:", dt_f1_valid)
print("seen at parameter:",maxp_dt1)
print("Max F1 for Decision Tree for testing:", dt_f1_test)
print("seen at parameter:",maxp_dt1)

print("Max F1 for SVM for training:", svm_f1_train)
print("seen at parameter:",maxp_linear1)
print("Max F1 for SVM for validation:", svm_f1_valid)
print("seen at parameter:",maxp_linear1)
print("Max F1 for SVM for testing:", svm_f1_test)
print("seen at parameter:",maxp_linear1)

Max F1 for Naive Bayes for training: 0.8709333333333333
seen at parameter: 0.1
Max F1 for Naive Bayes for validation: 0.8436
seen at parameter: 0.1
Max F1 for Naive Bayes for testing: 0.83176
seen at parameter: 0.1
Max F1 for Decision Tree for training: 0.8117333333333333
seen at parameter: 14
Max F1 for Decision Tree for validation: 0.7226
seen at parameter: 14
Max F1 for Decision Tree for testing: 0.72664
seen at parameter: 14
Max F1 for SVM for training: 0.9632666666666667
seen at parameter: 0.01
Max F1 for SVM for validation: 0.8746
seen at parameter: 0.01
Max F1 for SVM for testing: 0.86924
seen at parameter: 0.01


Question 5

In [39]:
##### PARAMETER TUNING FOR IMDB_FBOW NAIVE BAYES, DECISION TREE, LINEAR SVM
parameters = [0, 0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 12, 20, 50]
maxf1_nb_fbow, maxp_nb_fbow = testingnb(IMDB_valid, IMDBt_fbow, IMDB_train, parameters, IMDBv_fbow)
print("Max F1 for Naive Bayes:",maxf1_nb_fbow)
print("seen at parameter:", maxp_nb_fbow)

maxf1_dt, maxp_dt = testingdt(IMDB_valid, IMDBt_fbow, IMDB_train, maxdepths, IMDBv_fbow)
print("Max F1 for DecisionTree:",maxf1_dt_fbow)
print("seen at parameter:", maxp_dt_fbow)

parameters_lin = [0.0001, 0.001, 0.01, 0.1, 1, 2, 10, 20,100, 1000]
maxf1_linear_fbow, maxp_linear_fbow = testinglinearIMDB(IMDB_valid, IMDBt_fbow, IMDB_train, parameters_lin, IMDBv_fbow)
print("Max F1 for Linear:",maxf1_linear_fbow)
print("seen at parameter:", maxp_linear_fbow)


  'setting alpha = %.1e' % _ALPHA_MIN)


parameter: 0
x 0.8415
parameter: 0.0001
x 0.8424000000000001
parameter: 0.001
x 0.8427
parameter: 0.01
x 0.8431000000000001
parameter: 0.1
x 0.8436
parameter: 1
x 0.8424000000000001
parameter: 2
x 0.8416999999999999
parameter: 10
x 0.8378
parameter: 12
x 0.8378
parameter: 20
x 0.8359
parameter: 50
x 0.8248999999999999
Max F1 for Naive Bayes: 0.8436
seen at parameter: 0.1
parameter: 1.0
x 0.6118
parameter: 2.0
x 0.6469
parameter: 3.0
x 0.6675
parameter: 4.0
x 0.6827
parameter: 5.0
x 0.6853
parameter: 6.0
x 0.6895
parameter: 7.0
x 0.6944
parameter: 8.0
x 0.7036
parameter: 9.0
x 0.7106
parameter: 10.0
x 0.7142
parameter: 11.0
x 0.7198
parameter: 12.0
x 0.7183
parameter: 13.0
x 0.7221
parameter: 14.0
x 0.7195
parameter: 15.0
x 0.7158999999999999
parameter: 16.0
x 0.7144
parameter: 17.0
x 0.7130999999999998
parameter: 18.0
x 0.7164
parameter: 19.0
x 0.7145
parameter: 20.0
x 0.7133
parameter: 21.0
x 0.7168
parameter: 22.0
x 0.7154
parameter: 23.0
x 0.7123
parameter: 24.0
x 0.7152
parameter: 

In [40]:
svm_f1_train_fbow=f1_score(IMDB_train[1], linearIMDB(IMDBt_fbow, IMDB_train[1], maxp_linear_fbow, IMDBt_fbow),average='micro')
svm_f1_valid_fbow=f1_score(IMDB_valid[1], linearIMDB(IMDBt_fbow, IMDB_train[1], maxp_linear_fbow, IMDBv_fbow),average='micro')
svm_f1_test_fbow=f1_score(IMDB_test[1], linearIMDB(IMDBt_fbow, IMDB_train[1], maxp_linear_fbow, IMDBtest_fbow),average='micro')
print("Max F1 for SVM for training:", svm_f1_train_fbow)
print("seen at parameter:", maxp_linear_fbow)
print("Max F1 for SVM for validation:", svm_f1_valid_fbow)
print("seen at parameter:",maxp_linear_fbow)
print("Max F1 for SVM for testing:", svm_f1_test_fbow)
print("seen at parameter:",maxp_linear_fbow)

Max F1 for SVM for training: 0.938
seen at parameter: 100
Max F1 for SVM for validation: 0.8778
seen at parameter: 100
Max F1 for SVM for testing: 0.87416
seen at parameter: 100


In [41]:
#maxp_linear_fbow=100
###### F1 FOR FBOW IMDB
####### Training, validation and test F1-measure 
# Naive bayes F1 using best parameter = 0.1
nb_f1_train_fbow = f1_score(IMDB_train[1], naivebayes(IMDBt_fbow, IMDB_train[1], maxp_nb_fbow, IMDBt_fbow),average='micro')
gb_f1_train_fbow = f1_score(IMDB_train[1], naivegaus(IMDBt_fbow, IMDB_train[1], IMDBt_fbow), average="micro")


nb_f1_valid_fbow = f1_score(IMDB_valid[1], naivebayes(IMDBt_fbow, IMDB_train[1], maxp_nb_fbow, IMDBv_fbow),average='micro')
gb_f1_valid_fbow = f1_score(IMDB_valid[1], naivegaus(IMDBt_fbow, IMDB_train[1], IMDBv_fbow), average="micro")

nb_f1_test_fbow = f1_score(IMDB_test[1], naivebayes(IMDBt_fbow, IMDB_train[1], maxp_nb_fbow, IMDBtest_fbow),average='micro')
gb_f1_test_fbow = f1_score(IMDB_test[1], naivegaus(IMDBt_fbow, IMDB_train[1], IMDBtest_fbow), average="micro")


dt_f1_train_fbow=f1_score(IMDB_train[1], decisiontree_maxd(IMDBt_fbow, IMDB_train[1], maxp_dt_fbow, IMDBt_fbow), average='micro')
dt_f1_valid_fbow=f1_score(IMDB_valid[1], decisiontree_maxd(IMDBt_fbow, IMDB_train[1], maxp_dt_fbow, IMDBv_fbow), average='micro')
dt_f1_test_fbow=f1_score(IMDB_test[1], decisiontree_maxd(IMDBt_fbow, IMDB_train[1], maxp_dt_fbow, IMDBtest_fbow), average='micro')

svm_f1_train_fbow=f1_score(IMDB_train[1], linearIMDB(IMDBt_fbow, IMDB_train[1], maxp_linear_fbow, IMDBt_fbow),average='micro')
svm_f1_valid_fbow=f1_score(IMDB_valid[1], linearIMDB(IMDBt_fbow, IMDB_train[1], maxp_linear_fbow, IMDBv_fbow),average='micro')
svm_f1_test_fbow=f1_score(IMDB_test[1], linearIMDB(IMDBt_fbow, IMDB_train[1], maxp_linear_fbow, IMDBtest_fbow),average='micro')

#print("Max F1 for Naive Bayes for training:", nb_f1_train_fbow)
#print("seen at parameter:", maxp_nb_fbow)
print("Max F1 for Gaussian Bayes for training:", gb_f1_train_fbow)
#print("Max F1 for Naive Bayes for validation:", nb_f1_valid_fbow)
#print("seen at parameter:", maxp_nb_fbow)
print("Max F1 for Gaussian Bayes for validation:", gb_f1_valid_fbow)
#print("Max F1 for Naive Bayes for testing:", nb_f1_test_fbow)
#print("seen at parameter:", maxp_nb_fbow)
print("Max F1 for Gaussian Bayes for testing:", gb_f1_test_fbow)

print("Max F1 for Decision Tree for training:", dt_f1_train_fbow)
print("seen at parameter:",maxp_dt_fbow)
print("Max F1 for Decision Tree for validation:", dt_f1_valid_fbow)
print("seen at parameter:",maxp_dt_fbow)
print("Max F1 for Decision Tree for testing:", dt_f1_test_fbow)
print("seen at parameter:",maxp_dt_fbow)

print("Max F1 for SVM for training:", svm_f1_train_fbow)
print("seen at parameter:", maxp_linear_fbow)
print("Max F1 for SVM for validation:", svm_f1_valid_fbow)
print("seen at parameter:",maxp_linear_fbow)
print("Max F1 for SVM for testing:", svm_f1_test_fbow)
print("seen at parameter:",maxp_linear_fbow)

Max F1 for Gaussian Bayes for training: 0.8617333333333334
Max F1 for Gaussian Bayes for validation: 0.7508000000000001
Max F1 for Gaussian Bayes for testing: 0.68908
Max F1 for Decision Tree for training: 0.7662666666666667
seen at parameter: 10.0
Max F1 for Decision Tree for validation: 0.7141
seen at parameter: 10.0
Max F1 for Decision Tree for testing: 0.71024
seen at parameter: 10.0
Max F1 for SVM for training: 0.938
seen at parameter: 100
Max F1 for SVM for validation: 0.8778
seen at parameter: 100
Max F1 for SVM for testing: 0.87416
seen at parameter: 100
