In [7]:
import pandas as pd
import numpy as np
import pickle
import random
from scipy.sparse import *

# Compute GloVe Word Embeddings

In [8]:
print("loading cooccurrence matrix")
with open('cooc.pkl', 'rb') as f:
        cooc = pickle.load(f)
print("{} nonzero entries".format(cooc.nnz))

loading cooccurrence matrix
6496907 nonzero entries


In [9]:
nmax = 100
print("using nmax =", nmax, ", cooc.max() =", cooc.max())

using nmax = 100 , cooc.max() = 207302


In [10]:
print("initializing embeddings")
embedding_dim = 25
xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
ys = np.random.normal(size=(cooc.shape[1], embedding_dim))

initializing embeddings


In [11]:
epochs = 20

In [12]:
eta = 0.001
alpha = 3 / 4

In [13]:
for epoch in range(epochs):
        print("epoch {}".format(epoch))
        for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
            logn = np.log(n)
            fn = min(1.0, (n / nmax) ** alpha)
            x, y = xs[ix, :], ys[jy, :]
            scale = 2 * eta * fn * (logn - np.dot(x, y))
            xs[ix, :] += scale * y
            ys[jy, :] += scale * x
			# fill in your SGD code here,
			# for the update resulting from co-occurence (i,j)


epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
epoch 10
epoch 11
epoch 12
epoch 13
epoch 14
epoch 15
epoch 16
epoch 17
epoch 18
epoch 19


In [14]:
np.savez('embeddings', xs, ys)

In [6]:
 npzfile  = np.load('embeddings.npz')

In [7]:
xs = npzfile['arr_0']

In [8]:
ys = npzfile['arr_1']

In [15]:
with open('vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
vocab_size = len(vocab)

In [16]:
xsys =xs+ys

In [17]:
#average word embeddings for each sentence in positive file 
#create "sentence embeddings by averaging every word in each sentence" 

In [18]:
pos_train = []

In [20]:
with open('train_pos.txt') as file:
    for line in file:
        tmp = np.zeros(embedding_dim)
        count = 0
        for val in line.strip().split():
            IDX = vocab.get(val, -1)
            if(IDX!= -1):
                tmp+=xsys[IDX]
                count+=1
        if(count!= 0):
            tmp = tmp/count
            pos_train.append(tmp)

In [21]:
pos_train_arr = np.array(pos_train)

In [22]:
pos_train_arr.shape

(99995, 25)

In [25]:
neg_train = []

In [26]:
with open('train_neg.txt') as file:
    for line in file:
        tmp = np.zeros(embedding_dim)
        count = 0
        for val in line.strip().split():
            IDX = vocab.get(val, -1)
            if(IDX!= -1):
                tmp+=xsys[IDX]
                count+=1
        if(count!= 0):
            tmp = tmp/count
            neg_train.append(tmp)

In [27]:
neg_train_arr = np.array(neg_train)

In [28]:
neg_train_arr.shape

(99996, 25)

In [30]:
poslabels = np.repeat(1,99995)

In [31]:
neglabels = np.repeat(-1,99996)

In [32]:
labels = np.append(poslabels,neglabels)

In [33]:
training_set = np.concatenate((pos_train_arr, neg_train_arr))

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

### Different Classifiers

In [48]:
X_train, x_test, y_train, y_test = train_test_split(training_set, labels, test_size=0.3, random_state=42)

In [39]:
gnb = GaussianNB()

In [41]:
y_pred = gnb.fit(X_train, y_train).predict(x_test)

In [44]:
print( classification_report(y_test, y_pred) )

             precision    recall  f1-score   support

         -1       0.64      0.49      0.55     30142
          1       0.58      0.72      0.65     29856

avg / total       0.61      0.60      0.60     59998



In [46]:
print( accuracy_score(y_test, y_pred))

0.6048701623387446


In [81]:
clf = RandomForestClassifier(156,n_jobs =1)

In [82]:
clf.fit(X_train,  y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=156, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [83]:
preds= clf.predict(x_test)

In [84]:
print( classification_report(y_test, preds) )

             precision    recall  f1-score   support

         -1       0.70      0.57      0.63     30142
          1       0.64      0.75      0.69     29856

avg / total       0.67      0.66      0.66     59998



In [85]:
print( accuracy_score(y_test, preds))

0.6608886962898763


In [26]:
#svc.fit(x_train, y_train)

In [42]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [37]:
logreg = LogisticRegression()

In [38]:
logreg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
predictions = logreg.predict(x_test)

             precision    recall  f1-score   support

         -1       0.64      0.49      0.55     30142
          1       0.58      0.72      0.65     29856

avg / total       0.61      0.60      0.60     59998



In [41]:
print( accuracy_score(y_pred, predictions))

0.5905394432095917


In [46]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(15,), random_state=1)
clf.fit(x_train, y_train)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(15,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [48]:
preds = clf.predict(x_test)

In [50]:
print( accuracy_score(y_test, preds))

0.6085199946131682


In [84]:
#not so great :()

# Now, test data from competition

In [86]:
test_words = []

In [87]:
with open('test_data.txt') as file:
    for line in file:
        tmp = np.zeros(embedding_dim)
        count = 0
        for val in line.strip().split():
            IDX = vocab.get(val, -1)
            if(IDX!= -1):
                tmp+=xsys[IDX]
                count+=1
        if(count == 0):
            count = 1
        tmp = tmp/count
        test_words.append(tmp)

In [88]:
test_embeddings = np.array(test_words)

In [89]:
test_predictions = clf.predict(test_embeddings)

In [90]:
id_count = 1 
output_arr = []
for val in test_predictions:
    print(id_count, val)
    output_arr.append([id_count, val])
    id_count+=1

1 -1
2 1
3 -1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 -1
12 1
13 1
14 1
15 1
16 1
17 -1
18 1
19 1
20 -1
21 -1
22 -1
23 -1
24 -1
25 1
26 -1
27 1
28 -1
29 -1
30 1
31 1
32 1
33 -1
34 1
35 1
36 1
37 1
38 -1
39 -1
40 1
41 1
42 -1
43 -1
44 1
45 1
46 -1
47 1
48 1
49 1
50 -1
51 1
52 1
53 1
54 -1
55 1
56 1
57 -1
58 1
59 -1
60 -1
61 1
62 1
63 1
64 -1
65 1
66 1
67 1
68 -1
69 -1
70 -1
71 1
72 1
73 -1
74 1
75 -1
76 1
77 -1
78 -1
79 -1
80 1
81 -1
82 -1
83 1
84 -1
85 -1
86 -1
87 1
88 1
89 -1
90 1
91 1
92 1
93 -1
94 1
95 1
96 -1
97 -1
98 1
99 1
100 1
101 -1
102 1
103 1
104 -1
105 1
106 -1
107 -1
108 -1
109 1
110 -1
111 -1
112 -1
113 -1
114 1
115 -1
116 1
117 1
118 -1
119 1
120 1
121 1
122 1
123 -1
124 -1
125 1
126 -1
127 -1
128 1
129 1
130 1
131 1
132 1
133 -1
134 -1
135 -1
136 -1
137 1
138 1
139 1
140 1
141 1
142 1
143 -1
144 1
145 -1
146 -1
147 1
148 1
149 1
150 1
151 1
152 1
153 1
154 1
155 -1
156 1
157 1
158 1
159 -1
160 -1
161 1
162 -1
163 -1
164 1
165 1
166 1
167 1
168 1
169 -1
170 -1
171 -1
172 -1
173 

1936 1
1937 -1
1938 -1
1939 -1
1940 1
1941 -1
1942 1
1943 -1
1944 -1
1945 -1
1946 1
1947 -1
1948 1
1949 -1
1950 1
1951 1
1952 1
1953 1
1954 1
1955 -1
1956 1
1957 1
1958 -1
1959 1
1960 1
1961 1
1962 1
1963 1
1964 1
1965 1
1966 1
1967 1
1968 1
1969 -1
1970 1
1971 -1
1972 -1
1973 -1
1974 -1
1975 -1
1976 1
1977 -1
1978 1
1979 1
1980 -1
1981 1
1982 1
1983 1
1984 1
1985 -1
1986 -1
1987 -1
1988 1
1989 1
1990 -1
1991 1
1992 -1
1993 -1
1994 -1
1995 1
1996 1
1997 -1
1998 1
1999 1
2000 -1
2001 -1
2002 -1
2003 -1
2004 1
2005 1
2006 1
2007 1
2008 -1
2009 -1
2010 1
2011 1
2012 -1
2013 -1
2014 1
2015 -1
2016 1
2017 -1
2018 -1
2019 1
2020 -1
2021 1
2022 -1
2023 1
2024 -1
2025 -1
2026 -1
2027 -1
2028 1
2029 -1
2030 -1
2031 1
2032 1
2033 -1
2034 1
2035 1
2036 -1
2037 -1
2038 -1
2039 1
2040 -1
2041 1
2042 1
2043 -1
2044 1
2045 1
2046 1
2047 1
2048 -1
2049 -1
2050 -1
2051 -1
2052 1
2053 1
2054 -1
2055 1
2056 -1
2057 1
2058 1
2059 -1
2060 -1
2061 -1
2062 -1
2063 1
2064 -1
2065 1
2066 1
2067 1
2068 1
2069 -

4185 -1
4186 -1
4187 -1
4188 1
4189 1
4190 -1
4191 -1
4192 -1
4193 -1
4194 1
4195 1
4196 1
4197 1
4198 -1
4199 1
4200 1
4201 1
4202 1
4203 -1
4204 -1
4205 -1
4206 -1
4207 1
4208 -1
4209 -1
4210 -1
4211 -1
4212 -1
4213 1
4214 1
4215 1
4216 1
4217 -1
4218 1
4219 1
4220 1
4221 1
4222 -1
4223 1
4224 -1
4225 1
4226 -1
4227 -1
4228 1
4229 -1
4230 -1
4231 -1
4232 1
4233 1
4234 1
4235 -1
4236 1
4237 -1
4238 -1
4239 1
4240 1
4241 -1
4242 1
4243 1
4244 1
4245 -1
4246 -1
4247 -1
4248 -1
4249 1
4250 1
4251 1
4252 1
4253 1
4254 1
4255 1
4256 1
4257 1
4258 1
4259 1
4260 1
4261 1
4262 1
4263 1
4264 -1
4265 1
4266 1
4267 -1
4268 1
4269 1
4270 -1
4271 -1
4272 -1
4273 1
4274 1
4275 1
4276 1
4277 1
4278 -1
4279 1
4280 1
4281 -1
4282 -1
4283 -1
4284 1
4285 1
4286 1
4287 1
4288 -1
4289 -1
4290 1
4291 -1
4292 1
4293 1
4294 1
4295 -1
4296 1
4297 -1
4298 -1
4299 1
4300 1
4301 -1
4302 1
4303 1
4304 -1
4305 -1
4306 1
4307 1
4308 1
4309 -1
4310 -1
4311 -1
4312 1
4313 -1
4314 1
4315 -1
4316 -1
4317 -1
4318 -1
431

5435 1
5436 1
5437 -1
5438 -1
5439 1
5440 1
5441 -1
5442 1
5443 1
5444 1
5445 1
5446 1
5447 1
5448 -1
5449 1
5450 -1
5451 -1
5452 1
5453 1
5454 1
5455 1
5456 -1
5457 -1
5458 -1
5459 -1
5460 -1
5461 1
5462 1
5463 1
5464 1
5465 1
5466 -1
5467 1
5468 -1
5469 -1
5470 -1
5471 1
5472 1
5473 1
5474 1
5475 -1
5476 -1
5477 -1
5478 1
5479 -1
5480 1
5481 1
5482 1
5483 1
5484 1
5485 1
5486 1
5487 1
5488 1
5489 1
5490 1
5491 1
5492 -1
5493 -1
5494 -1
5495 -1
5496 -1
5497 -1
5498 -1
5499 -1
5500 -1
5501 -1
5502 1
5503 -1
5504 -1
5505 1
5506 1
5507 -1
5508 1
5509 -1
5510 1
5511 1
5512 1
5513 1
5514 1
5515 1
5516 -1
5517 1
5518 1
5519 1
5520 -1
5521 1
5522 1
5523 -1
5524 -1
5525 -1
5526 -1
5527 -1
5528 1
5529 -1
5530 1
5531 -1
5532 -1
5533 1
5534 1
5535 1
5536 1
5537 -1
5538 -1
5539 1
5540 -1
5541 -1
5542 1
5543 -1
5544 -1
5545 1
5546 1
5547 1
5548 -1
5549 -1
5550 1
5551 -1
5552 1
5553 1
5554 -1
5555 -1
5556 1
5557 -1
5558 -1
5559 -1
5560 1
5561 1
5562 -1
5563 1
5564 1
5565 1
5566 -1
5567 -1
5568 1
55

7685 1
7686 -1
7687 -1
7688 1
7689 -1
7690 1
7691 1
7692 1
7693 1
7694 -1
7695 1
7696 -1
7697 1
7698 -1
7699 -1
7700 -1
7701 -1
7702 1
7703 1
7704 1
7705 -1
7706 -1
7707 1
7708 -1
7709 1
7710 -1
7711 1
7712 1
7713 -1
7714 -1
7715 1
7716 -1
7717 1
7718 1
7719 1
7720 1
7721 -1
7722 1
7723 1
7724 1
7725 -1
7726 -1
7727 1
7728 -1
7729 1
7730 1
7731 1
7732 1
7733 -1
7734 -1
7735 1
7736 1
7737 -1
7738 1
7739 -1
7740 -1
7741 -1
7742 -1
7743 1
7744 1
7745 1
7746 1
7747 1
7748 -1
7749 -1
7750 1
7751 -1
7752 -1
7753 -1
7754 1
7755 1
7756 -1
7757 1
7758 1
7759 1
7760 -1
7761 -1
7762 1
7763 1
7764 -1
7765 1
7766 1
7767 -1
7768 -1
7769 1
7770 -1
7771 -1
7772 -1
7773 -1
7774 -1
7775 1
7776 -1
7777 -1
7778 -1
7779 1
7780 1
7781 -1
7782 -1
7783 -1
7784 -1
7785 -1
7786 -1
7787 -1
7788 1
7789 -1
7790 -1
7791 1
7792 -1
7793 -1
7794 1
7795 -1
7796 1
7797 1
7798 -1
7799 -1
7800 -1
7801 -1
7802 1
7803 -1
7804 1
7805 1
7806 1
7807 1
7808 1
7809 -1
7810 1
7811 -1
7812 1
7813 1
7814 -1
7815 -1
7816 -1
7817 -1


8934 1
8935 1
8936 1
8937 1
8938 1
8939 -1
8940 1
8941 1
8942 1
8943 1
8944 -1
8945 -1
8946 1
8947 1
8948 -1
8949 1
8950 1
8951 1
8952 1
8953 -1
8954 -1
8955 1
8956 -1
8957 1
8958 -1
8959 -1
8960 1
8961 1
8962 -1
8963 1
8964 1
8965 1
8966 -1
8967 -1
8968 -1
8969 1
8970 -1
8971 1
8972 -1
8973 1
8974 1
8975 1
8976 1
8977 -1
8978 1
8979 1
8980 1
8981 1
8982 1
8983 1
8984 -1
8985 1
8986 -1
8987 1
8988 1
8989 -1
8990 1
8991 1
8992 1
8993 1
8994 1
8995 -1
8996 1
8997 1
8998 -1
8999 -1
9000 -1
9001 -1
9002 1
9003 1
9004 -1
9005 1
9006 -1
9007 1
9008 -1
9009 1
9010 1
9011 1
9012 1
9013 1
9014 1
9015 -1
9016 -1
9017 -1
9018 1
9019 1
9020 1
9021 -1
9022 1
9023 -1
9024 -1
9025 1
9026 -1
9027 1
9028 -1
9029 1
9030 -1
9031 1
9032 1
9033 -1
9034 -1
9035 1
9036 1
9037 -1
9038 -1
9039 1
9040 1
9041 -1
9042 -1
9043 1
9044 -1
9045 -1
9046 1
9047 -1
9048 -1
9049 -1
9050 -1
9051 -1
9052 -1
9053 1
9054 1
9055 1
9056 1
9057 1
9058 1
9059 1
9060 1
9061 -1
9062 1
9063 1
9064 -1
9065 1
9066 -1
9067 -1
9068 -1


In [91]:
output_df = pd.DataFrame(np.array(output_arr))

In [92]:
output_df.columns=["Id", "Prediction"]

In [93]:
output_df.set_index('Id', inplace=True)

In [94]:
output_df.to_csv("Predictions1.csv")