In [37]:
import os
import sys
import gensim
import pandas as pd
from gensim.models.doc2vec import TaggedDocument
import sklearn
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn import utils
import numpy as np
from gensim.models import KeyedVectors
import nltk
from nltk.corpus import stopwords
import string
import regex as re

In [2]:
data=pd.read_csv('eng_tweets.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,good afternoon time coffee
1,1,hello respect board deputies british people je...
2,2,hello respect board deputies british people we...
3,3,religions know meaning word heavenly god
4,4,hello respect board deputies british people je...


In [3]:
data=data.dropna()

In [61]:
data.shape

(4037352, 3)

In [20]:
data['0'][0]

'good afternoon time coffee'

In [4]:
from sklearn.model_selection import train_test_split
SEED = 2000

In [5]:
x_train, x_validation=train_test_split(data['0'],test_size=.2,random_state=SEED)

In [6]:
def labelize_text(text,label):
    result = []
    prefix = label
    for i, t in zip(text.index, text):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result
  
all_x = pd.concat([x_train,x_validation])

all_x_w2v = labelize_text(all_x, 'ALL')
x_train = labelize_text(x_train, 'TRAIN')
x_validation = labelize_text(x_validation, 'TEST')

In [7]:
model_w2v = Word2Vec(vector_size=200, min_count=10)
model_w2v.build_vocab([x.words for x in tqdm(all_x_w2v)])
model_w2v.train([x.words for x in tqdm(all_x_w2v)], total_examples=len(all_x_w2v), epochs=1)

100%|███████████████████████████████████████████████████████████████████| 4037352/4037352 [00:01<00:00, 2079133.56it/s]
100%|███████████████████████████████████████████████████████████████████| 4037352/4037352 [00:01<00:00, 3334052.29it/s]


(30018662, 31468385)

In [8]:
model_w2v.wv.most_similar('good')

[('bad', 0.6854493618011475),
 ('tough', 0.652366042137146),
 ('great', 0.6457939743995667),
 ('decent', 0.5944512486457825),
 ('excellent', 0.5611194968223572),
 ('nice', 0.5434066653251648),
 ('interesting', 0.5338517427444458),
 ('positive', 0.5309790372848511),
 ('fantastic', 0.5115781426429749),
 ('exciting', 0.5033203959465027)]

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in all_x_w2v])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

def build_Word_Vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            
            continue
    if count != 0:
        vec /= count
    return vec



In [10]:
word_vectors = model_w2v.wv
word_vectors.save('vectors.kv')
reloaded_word_vectors = KeyedVectors.load('vectors.kv')

In [11]:
positive_concepts = ['excellent', 'awesome', 'cool','decent','amazing', 'strong', 'good', 'great', 'funny', 'entertaining'] 
pos_concepts = [concept for concept in positive_concepts if concept in reloaded_word_vectors]

In [12]:
negative_concepts = ['terrible','awful','horrible','boring','bad', 'disappointing', 'weak', 'poor',  'senseless','confusing'] 
neg_concepts = [concept for concept in negative_concepts if concept in reloaded_word_vectors]

In [13]:
# topn_df_scores = topn_semantic_sentiment_analysis (keyed_vectors = reloaded_word_vectors,
#                                                    positive_target_tokens = pos_concepts, 
#                                                    negative_target_tokens = neg_concepts,
#                                                    doc_tokens = data['0'],
#                                                      topn=30)

In [38]:
words = set(nltk.corpus.words.words())
def remove_word(text,words):
    x=" ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())
    return x
data['edit']=data['0'].apply(lambda x: remove_word(x,words))

def remove_punc(text):
    puncfree="".join([i for i in text if i not in string.punctuation])
    return puncfree
data['edit']=data['edit'].apply(lambda x:remove_punc(x))
data['edit']=data['edit'].apply(lambda x:x.lower())

def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
data['edit']=data['edit'].apply(lambda x: tokenization(x))
def make_list(text):
    l=list(text.strip().split())
    return l
data['edit']=data['edit'].apply(lambda x: make_list(x[0]))

In [None]:
sentiment1=[]
for ii ,idx in data.iterrows():
    s=0
    for j in idx['edit']:
        if j in reloaded_word_vectors:
            p=0
            n=0
            for i in model_w2v.wv.most_similar(j):
                if i[0] in positive_concepts:
                    p+=i[1]
                elif i[0] in negative_concepts:
                    n+=i[1]
            s+=(p-n)
    if s>0:
        sentiment1.append('Positive')
    elif s<0:
        sentiment1.append('Negative')
    else:
        sentiment1.append('Neutral')

In [44]:
p=0
n=0
for x in model_w2v.wv.most_similar('win'):
    if x[0] in positive_concepts:
        p+=x[1]
    elif x[0] in negative_concepts:
        n+=x[1]
print(p,n)

0 0


In [48]:
model_w2v.wv.most_similar('fun')

[('enjoy', 0.6119229197502136),
 ('activities', 0.5808720588684082),
 ('busy', 0.5795847177505493),
 ('enjoyable', 0.5715605020523071),
 ('enjoying', 0.5671053528785706),
 ('awesome', 0.5581031441688538),
 ('fab', 0.5524030327796936),
 ('exciting', 0.5494546294212341),
 ('amazing', 0.5440994501113892),
 ('enjoyed', 0.5433796644210815)]

In [43]:
data.head()

Unnamed: 0.1,Unnamed: 0,0,edit
0,0,good afternoon time coffee,"[good, afternoon, time, coffee]"
1,1,hello respect board deputies british people je...,"[hello, respect, board, people, welcome, morocco]"
2,2,hello respect board deputies british people we...,"[hello, respect, board, people, welcome, morocco]"
3,3,religions know meaning word heavenly god,"[know, meaning, word, heavenly, god]"
4,4,hello respect board deputies british people je...,"[hello, respect, board, people, welcome, morocco]"


In [74]:
len(sentiment1)

33850

In [64]:
bagofwords=set()
for i,idx in data.iterrows():
    for x in idx['edit']:
        bagofwords.add(x)
        

In [63]:
pd.Series(sentiment).value_counts()

Neutral     102656
Positive     19940
Negative      4303
dtype: int64