### General Information

Our options: Neural Network, LSTM, Logistic Regression, SVM, etc.



In [1]:
import pandas as pd
import numpy as np

# Read train.csv into a DataFrame and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer="train.csv")
dataset.head(n=5)

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0


In [2]:
# Keep only the rows whose 'grades' value is non-zero.
has_grades = dataset['grades'].ne(0)
dataset = dataset[has_grades]
dataset.head()

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4
5,8832,All 22 <promises/> Trump made in his speech to...,sounds,22200,1.2


In [3]:
# Drop the 'id' and 'grades' columns; the remaining columns are kept as-is.
dataset = dataset.drop(columns=['id', 'grades'])
dataset.head()

Unnamed: 0,original,edit,meanGrade
0,France is ‘ hunting down its citizens who join...,twins,0.2
1,"Pentagon claims 2,000 % increase in Russian tr...",bowling,1.6
2,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,1.0
3,"In an apparent first , Iran and Israel <engage...",slap,0.4
5,All 22 <promises/> Trump made in his speech to...,sounds,1.2


In [4]:
import re

# Build 'replaced_sentence' by substituting each row's edit word for the
# <word/> placeholder found in the 'original' headline.
# - The pattern only matches a single tag (no '<' or '>' inside it), so a
#   stray '<' elsewhere in the headline cannot make the match swallow
#   surrounding text the way the greedy '<.*/>' could.
# - The replacement is supplied as a function so the edit text is inserted
#   literally; a plain replacement string would have backslashes and group
#   references interpreted by re.sub.
edit_tag = re.compile(r'<[^<>]*/>')

storage_array = [
    edit_tag.sub(lambda _match, edit=edit: edit, original)
    for original, edit in zip(dataset['original'], dataset['edit'])
]

dataset['replaced_sentence'] = storage_array
dataset.head()

Unnamed: 0,original,edit,meanGrade,replaced_sentence
0,France is ‘ hunting down its citizens who join...,twins,0.2,France is ‘ hunting down its citizens who join...
1,"Pentagon claims 2,000 % increase in Russian tr...",bowling,1.6,"Pentagon claims 2,000 % increase in Russian tr..."
2,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,1.0,Iceland PM Calls Snap Vote as Pedophile Furor ...
3,"In an apparent first , Iran and Israel <engage...",slap,0.4,"In an apparent first , Iran and Israel slap ea..."
5,All 22 <promises/> Trump made in his speech to...,sounds,1.2,All 22 sounds Trump made in his speech to Cong...


In [5]:
## Convert to lowercase
def _lowercase_words(sentence):
    """Lowercase each whitespace-separated word and rejoin with single spaces."""
    lowered = [word.lower() for word in sentence.split()]
    return " ".join(lowered)

dataset['replaced_sentence'] = dataset['replaced_sentence'].apply(_lowercase_words)
dataset['replaced_sentence'].head()

0    france is ‘ hunting down its citizens who join...
1    pentagon claims 2,000 % increase in russian tr...
2    iceland pm calls snap vote as pedophile furor ...
3    in an apparent first , iran and israel slap ea...
5    all 22 sounds trump made in his speech to cong...
Name: replaced_sentence, dtype: object

In [6]:
# Remove Stop Words

from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership tests against a list scan it linearly for every word; a set
# gives constant-time lookups without changing which words are dropped.
# `stop` itself is left as the original list.
stop_lookup = set(stop)
dataset['replaced_sentence'] = dataset['replaced_sentence'].apply(
    lambda sentence: " ".join(word for word in sentence.split() if word not in stop_lookup)
)
dataset['replaced_sentence'].head()

0    france ‘ hunting citizens joined twins ’ witho...
1    pentagon claims 2,000 % increase russian troll...
2    iceland pm calls snap vote pedophile furor cra...
3         apparent first , iran israel slap militarily
5     22 sounds trump made speech congress , one chart
Name: replaced_sentence, dtype: object

In [7]:
# Remove punctuation: delete every character that is neither a word
# character nor whitespace.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged. The raw
# string avoids invalid-escape warnings for \w and \s.
dataset['replaced_sentence'] = dataset['replaced_sentence'].str.replace(r'[^\w\s]', '', regex=True)
dataset['replaced_sentence'].head()

0    france  hunting citizens joined twins  without...
1    pentagon claims 2000  increase russian trolls ...
2    iceland pm calls snap vote pedophile furor cra...
3          apparent first  iran israel slap militarily
5      22 sounds trump made speech congress  one chart
Name: replaced_sentence, dtype: object

In [8]:
# Check for common words: count every whitespace-separated token across all
# sentences and keep the ten most frequent.
all_words = ' '.join(dataset['replaced_sentence']).split()
word_counts = pd.Series(all_words).value_counts()
frequent_words = word_counts.head(10)
frequent_words

trump     3238
s         2218
us         709
says       597
house      479
donald     348
new        345
nt         344
white      330
gop        286
dtype: int64

In [9]:
# Remove 's' and 'nt' (consider replacing)
words_to_remove = ['s', 'nt']

def _drop_listed_words(sentence):
    """Return the sentence without any token listed in words_to_remove."""
    kept = [word for word in sentence.split() if word not in words_to_remove]
    return " ".join(kept)

dataset['replaced_sentence'] = dataset['replaced_sentence'].apply(_drop_listed_words)

In [10]:
# Check for rare words: the last ten entries of the token frequency table.
corpus_words = ' '.join(dataset['replaced_sentence']).split()
rare = pd.Series(corpus_words).value_counts().tail(10)
rare

ham            1
geckos         1
sailboat       1
rebuttal       1
rectal         1
closets        1
caterpillar    1
brick          1
fabrics        1
disposed       1
dtype: int64

In [11]:
# remove rare words
# `rare` arrives as the value_counts Series from the previous cell. Convert it
# to a list of words only in that case, so re-running this cell does not fail
# on `list.index` (a method, not an index of words).
if isinstance(rare, pd.Series):
    rare = list(rare.index)
# Set lookup for the per-word membership test; the words removed are the same.
rare_lookup = set(rare)
dataset['replaced_sentence'] = dataset['replaced_sentence'].apply(
    lambda sentence: " ".join(word for word in sentence.split() if word not in rare_lookup)
)

In [12]:
# spell correction
import textblob
from textblob import TextBlob

def _spell_correct(sentence):
    """Return the sentence after TextBlob's spelling correction, as a plain str."""
    return str(TextBlob(sentence).correct())

# Preview only: the corrected text for the first five rows is displayed and
# is not written back to the dataset.
dataset['replaced_sentence'].head(5).apply(_spell_correct)

0    france hunting citizens joined twins without t...
1    pentagon claims 2000 increase russian trills b...
2    iceland pm calls snap vote pedophile for clash...
3              apparent first ran israel slap military
5       22 sounds tramp made speech congress one chart
Name: replaced_sentence, dtype: object

In [13]:
## remove digits
from string import digits
# 

# Delete every run of digits. regex=True must be explicit: pandas >= 2.0
# treats the pattern as a literal string by default, so '\d+' would never
# match. The raw string avoids the invalid-escape warning for \d.
dataset['replaced_sentence'] = dataset['replaced_sentence'].str.replace(r'\d+', '', regex=True)
dataset.head()

Unnamed: 0,original,edit,meanGrade,replaced_sentence
0,France is ‘ hunting down its citizens who join...,twins,0.2,france hunting citizens joined twins without t...
1,"Pentagon claims 2,000 % increase in Russian tr...",bowling,1.6,pentagon claims increase russian trolls bowli...
2,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,1.0,iceland pm calls snap vote pedophile furor cra...
3,"In an apparent first , Iran and Israel <engage...",slap,0.4,apparent first iran israel slap militarily
5,All 22 <promises/> Trump made in his speech to...,sounds,1.2,sounds trump made speech congress one chart


In [14]:
## Tokenization of sentences: each sentence becomes a list of tokens.
# wikipedia embeddings trained over all of wikipedia.
# convolution over the word embeddings (i.e. average it)

import nltk

# Tokenize row by row, then store the resulting lists in a new column.
tokenized = dataset['replaced_sentence'].apply(nltk.word_tokenize).tolist()
dataset['tokenized'] = tokenized

dataset.head()

Unnamed: 0,original,edit,meanGrade,replaced_sentence,tokenized
0,France is ‘ hunting down its citizens who join...,twins,0.2,france hunting citizens joined twins without t...,"[france, hunting, citizens, joined, twins, wit..."
1,"Pentagon claims 2,000 % increase in Russian tr...",bowling,1.6,pentagon claims increase russian trolls bowli...,"[pentagon, claims, increase, russian, trolls, ..."
2,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,1.0,iceland pm calls snap vote pedophile furor cra...,"[iceland, pm, calls, snap, vote, pedophile, fu..."
3,"In an apparent first , Iran and Israel <engage...",slap,0.4,apparent first iran israel slap militarily,"[apparent, first, iran, israel, slap, militarily]"
5,All 22 <promises/> Trump made in his speech to...,sounds,1.2,sounds trump made speech congress one chart,"[sounds, trump, made, speech, congress, one, c..."


In [15]:
from gensim.models import KeyedVectors
# Load vectors directly from the file
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# Access vectors for specific words with a keyed lookup:
vector = model['easy']
# see the shape of the vector (300,)
vector.shape
# Processing sentences is not as simple as with Spacy:
# a keyed lookup raises KeyError for a token that is not in the vocabulary, so
# skip out-of-vocabulary tokens (same `in model` guard used when embedding
# the dataset).
vectors = [
    model[x]
    for x in "This is some text I am processing with Spacy".split(' ')
    if x in model
]

In [None]:
#model.similarity("sparkle", "glitter")
print(type(model))

if "doj" in model:
    print("it is!")

# For every row, collect the embedding of each token that exists in the model;
# out-of-vocabulary tokens are skipped, so a row's list can be shorter than
# its token list (or empty).
# The per-row print of the raw vectors was removed: it wrote every embedding
# array of every sentence to the cell output.
vectors_array = [
    [model[token] for token in tokens if token in model]
    for tokens in dataset['tokenized']
]

print(len(vectors_array))

dataset['word embeddings'] = vectors_array




### Building a Model

### Testing the Model