## Loading a Dataset

In [1]:
import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

import pandas as pd

df = pd.read_csv('/Users/Henna/Desktop/Scraping/Data/dataset_gur_iac.csv', encoding='ISO-8859-1')
print(len(df))
df.head()

text = df["text"]
y = df["resistance attempt"]

print(len(text))
print(len(y))

17354
17354
17354


In [2]:
nan_rows = df[df.isnull().T.any().T]
nan_rows

Unnamed: 0.1,Unnamed: 0,text,resistance attempt
9123,,The guy is a sleep scientist and yet he makes ...,1


In [16]:
type(str(text))

str

## Sentiment Features

In [11]:
from textblob import TextBlob

def sentiment_polarity(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None
    
def sentiment_subjectivity(text):
    try:
        return TextBlob(text).sentiment.subjectivity
    except:
        return None

df['polarity'] = df['text'].apply(sentiment_polarity)
df['subjectivity'] = df['text'].apply(sentiment_subjectivity)

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,resistance attempt
0,7719,Well yes.,0.0
1,7720,"Exact, to the point, & beautiful.",0.0
2,7721,"Yes, but it also has consequences.",0.0
3,7722,k,0.0
4,7723,And its all evolved via NDS - marvellous!,0.0


## Cleaning

In [3]:
import nltk
import re
import string
from nltk.stem.porter import *
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer

default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english') # or any other list of your chose
def clean_text(text, ):

    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s)]

    def remove_special_characters(text, characters=string.punctuation.replace('-', '')):
        tokens = tokenize_text(text)
        pattern = re.compile('[{}]'.format(re.escape(characters)))
        return ' '.join(filter(None, [pattern.sub('', t) for t in tokens]))

    def stem_text(text, stemmer=default_stemmer):
        tokens = tokenize_text(text)
        return ' '.join([stemmer.stem(t) for t in tokens])

    def remove_stopwords(text, stop_words=default_stopwords):
        tokens = [w for w in tokenize_text(text) if w not in stop_words]
        return ' '.join(tokens)

    text = text.strip(' ') #strip whitespaces
    text = text.lower() #lowercase
    text = stem_text(text) #stemming
    text = remove_special_characters(text) #remove punctuation and symbols
    text = remove_stopwords(text) #remove stopwords
    #text.strip(' ') # strip white spaces again?

    return text

text = text.tolist()
cleaned_text = [clean_text(i) for i in text]
print(len(cleaned_text))

#tokenizing
tokenized_text = [word_tokenize(i) for i in cleaned_text]
tokenized_text[1]

17354


[u'exact', u'point', u'beauti']

## Word2Vec

In [30]:
from gensim.models import FastText
from gensim.models import Phrases

bigrams = Phrases(cleaned_text)

model_fasttext = FastText(cleaned_text=bigrams[sentences], size=300, window=5, min_count=5, workers=4, sg=1)

print(model_fasttext)
model_fasttext.wv.most_similar("gove")

#saving
#model_fasttext.save("w2v_model_fasttext_d300")
#model = Word2Vec.load("w2v_model_fasttext_d300")

stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()] #restricts string to alphabetic characters only
    return doc

def filter_docs(corpus, texts, labels, polarity, subjectivity, condition_on_doc):
    """
    Filter corpus, texts and labels given the function condition_on_doc which takes
    a doc.
    The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus)
                 if condition_on_doc(doc)]
        
    polarity = [i for (i, doc) in zip(polarity, corpus) if condition_on_doc(doc)]
    subjectivity = [i for (i, doc) in zip(subjectivity, corpus) if condition_on_doc(doc)]
    labels = [i for (i, doc) in zip(labels, corpus) if condition_on_doc(doc)]
    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts, labels, polarity, subjectivity)

def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.vocab]
    return np.mean(word2vec_model[doc], axis=0)

def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.wv.vocab for word in doc)

NameError: name 'sentences' is not defined

In [None]:
#Applying the functions
corpus = []
corpus = [preprocess(text) for text in clean_text]

corpus, texts, y, polarity, subjectivity = filter_docs(corpus, clean_text, y, polarity, subjectivity, lambda doc: (len(doc) != 0))

corpus, texts, y, polarity, subjectivity = filter_docs(corpus, clean_text, y, polarity, subjectivity, lambda doc: has_vector_representation(model_fasttext, doc))

print(len(corpus))
print(len(texts))
print(len(y))
print(len(polarity))
print(len(subjectivity))

In [None]:
#Making to numpy array
x =[]
for doc in corpus: #look up each doc in model
    x.append(document_vector(model_fasttext, doc))
    
X = np.array(x) #list to array