In [103]:
import warnings
warnings.filterwarnings("ignore")  

import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer #stemmer

from sklearn.feature_extraction.text import CountVectorizer #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer #For TF-IDF
from gensim.models import Word2Vec                          #For Word2Vec

import numpy as np
import nltk
import re
import string

In [87]:
# Registry of the datasets handled by this notebook, keyed by a short name.
# The key is reused later as the prefix of the exported "<name>_light.csv" file.
dataframes = dict(
    tweets=pd.read_csv("./crowdflower.csv"),
)

In [88]:
# Spot-check: print the row at positional index 1 of the tweets frame.
print(dataframes["tweets"].iloc[1])

tweet_id                                            1956967666
sentiment                                              sadness
author                                               wannamama
content      Layin n bed with a headache  ughhhh...waitin o...
Name: 1, dtype: object


In [89]:
# Case-insensitive pattern for URI-like substrings. A match starts with
# "http://" / "https://", a "www." style prefix (optionally "www" + up to 3 digits),
# or a "domain.tld/" form, and continues over non-space characters, allowing
# parenthesised segments (one level of nesting). The final character class keeps
# trailing punctuation and quotes out of the match. Used by stripTagsAndUris.
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    """Return the plain text of ``x`` with markup, <code> elements and URIs removed.

    Parameters
    ----------
    x : str or None
        Raw text that may contain HTML markup and URIs.

    Returns
    -------
    str
        The text content of ``x`` after dropping every ``<code>`` element
        (together with its content) and deleting every ``uri_re`` match.
        Falsy input (``None``, ``""``) yields ``""``.
    """
    if not x:
        return ""
    # Parse the content as HTML
    soup = BeautifulSoup(x, "html.parser")
    # Remove *every* <code> element, not only the first one. Re-querying the
    # tree after each decompose() means a nested <code> that was destroyed
    # along with its parent is never touched again.
    code_tag = soup.find("code")
    while code_tag is not None:
        code_tag.decompose()
        code_tag = soup.find("code")
    # Get the remaining text out of the HTML, then strip out all URIs
    return re.sub(uri_re, "", soup.get_text())

def removePunctuation(x):
    """Lowercase ``x`` and blank out non-ASCII and punctuation characters.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        ``x`` lowercased, with every non-ASCII character and every character
        of ``string.punctuation`` replaced by a single space. The length of
        the string is preserved; whitespace is not collapsed.
    """
    # Lowercasing all words
    x = x.lower()
    # Replacing non-ASCII chars with a space
    x = re.sub(r'[^\x00-\x7f]', ' ', x)
    # Replacing all punctuation with a space. The characters must be escaped
    # before being placed in a character class: unescaped, the "\]" inside
    # string.punctuation is read as an escaped "]", so the backslash itself
    # would never be matched.
    return re.sub("[" + re.escape(string.punctuation) + "]", " ", x)

snow = nltk.stem.SnowballStemmer('english')
stops = set(stopwords.words("english"))
def stemAndRemoveStopwords(x):
    """Drop stopwords from ``x`` and stem what is left.

    ``x`` is split on whitespace; tokens found in ``stops`` are discarded and
    the remaining tokens are stemmed with ``snow``. The stems are returned
    joined by single spaces.
    """
    stemmed_tokens = []
    for token in x.split():
        # Stopwords are filtered on the raw token, before stemming
        if token in stops:
            continue
        stemmed_tokens.append(snow.stem(token))
    return " ".join(stemmed_tokens)
    
def remove_pattern(input_txt, pattern):
    """Delete every match of ``pattern`` from ``input_txt``.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` removed.

    Notes
    -----
    The substitution is done in a single pass with ``pattern`` itself. Feeding
    each matched string back into ``re.sub`` as a new regex is unsafe: a short
    match that is a prefix of a longer one (``@ab`` / ``@abc``) leaves residue
    behind, and matched text containing metacharacters is misinterpreted.
    """
    return re.sub(pattern, '', input_txt)

# Lookup table that collapses the dataset's fine-grained sentiment labels into
# four coarse classes: happy, angry, sad, neutral.
switch_Emotions = dict(
    # The coarse classes map to themselves
    happy="happy",
    angry="angry",
    sad="sad",
    neutral="neutral",
    # Fine-grained labels -> coarse class
    anger="angry",
    boredom="neutral",
    empty="neutral",
    enthusiasm="happy",
    fun="happy",
    happiness="happy",
    hate="angry",
    love="happy",
    relief="happy",
    sadness="sad",
    surprise="angry",
    worry="sad",
)


In [90]:
for df in dataframes.values():
    # Clean the tweet text in stages: drop @handles, strip markup and URIs,
    # blank out punctuation / non-ASCII, then remove stopwords and stem.
    # The handle pattern is a raw string so "\w" reaches the regex engine
    # instead of being an invalid string escape.
    df["content"] = (
        df["content"]
        .map(lambda text: remove_pattern(text, r"@[\w]*"))
        .map(stripTagsAndUris)
        .map(removePunctuation)
        .map(stemAndRemoveStopwords)
    )

    # Collapse the fine-grained labels into the coarse classes of switch_Emotions;
    # a label that is missing from the table becomes NaN.
    df["sentiment"] = df["sentiment"].map(switch_Emotions)

In [91]:
# Spot-check: print the row at positional index 14 of the tweets frame.
print(dataframes["tweets"].iloc[14])

tweet_id       1956970860
sentiment           angry
author       okiepeanut93
content          got news
Name: 14, dtype: object


In [92]:
# Write each frame to "<name>_light.csv" in the working directory,
# leaving the row index out of the file.
for name, df in dataframes.items():
    df.to_csv(f"{name}_light.csv", index=False)

In [120]:
# Bag of Words over the tweet text, vocabulary capped at 10,000 terms.
# NOTE(review): CountVectorizer stores term counts unless binary=True is passed;
# set it if a binary (presence/absence) encoding is what is wanted here.
# The frame is looked up by name instead of relying on a `df` loop variable
# left over from an earlier cell.
count_vect = CountVectorizer(max_features=10000)
bow_data = count_vect.fit_transform(dataframes["tweets"]["content"])
#print (bow_data[1])

# Bigram Bag of Words
#count_vect = CountVectorizer(ngram_range=(1,2))
#bbow_data = count_vect.fit_transform(dataframes["tweets"]["content"])
#print (bbow_data)

ValueError: not enough values to unpack (expected 2, got 1)

In [102]:
# TF-IDF features over the tweet text, vocabulary capped at 10,000 terms.
# The frame is looked up by name instead of relying on a `df` loop variable
# left over from an earlier cell.
tf_idf = TfidfVectorizer(max_features=10000)
tf_data = tf_idf.fit_transform(dataframes["tweets"]["content"])
#print(tf_data[1])

  (0, 5019)	0.5427952116026723
  (0, 1003)	0.275654676874232
  (0, 4033)	0.334054936674804
  (0, 9150)	0.48054452388509344
  (0, 9467)	0.4531087101920344
  (0, 1498)	0.2855951924909202


In [113]:
# Average Word2Vec from gensim: one vector per tweet, the mean of the vectors
# of its in-vocabulary words.

# Dimensionality of the word vectors, and therefore of every averaged vector
W2V_DIM = 50

# Split each tweet on whitespace. The frame is looked up by name instead of
# relying on a `df` loop variable left over from an earlier cell.
splitted = [row.split() for row in dataframes["tweets"]["content"]]

# NOTE(review): `size` is the gensim 3.x spelling; gensim >= 4 renamed it to
# `vector_size` — adjust if the environment is upgraded.
train_w2v = Word2Vec(splitted,min_count=5,size=W2V_DIM, workers=4)

avg_data = []
for row in splitted:
    vec = np.zeros(W2V_DIM)
    count = 0
    for word in row:
        # Words rarer than min_count are not in the vocabulary; skip them
        # explicitly rather than swallowing every exception.
        if word in train_w2v.wv:
            vec += train_w2v.wv[word]
            count += 1
    # A tweet with no in-vocabulary word keeps the zero vector; dividing by a
    # zero count would fill the row with NaN.
    avg_data.append(vec / count if count else vec)

#print(avg_data[1])


[ 0.43291656 -0.27668027 -0.493419    0.321446   -0.12003794 -0.42088827
  0.25616442 -0.24126766 -0.19893773 -0.41758203  0.01601632 -0.01655896
  0.41053424  0.19497878  0.56090855  0.82048703  0.04976484  0.08995618
  0.24982553  0.49340508  0.36752251 -0.04320811  0.20858104  0.53634716
 -0.08215591  0.2758802  -0.0517413  -0.23776088  0.12958361  0.34037551
  0.19576146  0.09431315  0.02896334  0.63303076  0.12666142 -0.22247616
  0.50168638  0.42224814 -0.07129398  0.17549671  0.56074589  0.17203297
 -0.29839978 -0.09352037 -0.27979904 -0.1571476  -0.0105348  -0.12540565
 -0.00448097  0.08642687]


2.742477177522826
0
2.3717532517197566
1
1.9712585830258789
2
1.9769513417784288
3
2.3585876622193664
4
2.516965644547606
5
2.865547782298292
6
1.0
7
1.412522102422573
8
1.7255215197053007
9
1.7239478430354396
10
1.4142135623730951
11
2.5791004168200535
12
2.416954993480823
13
1.3771123465271673
14
1.7119671991592453
15
1.0
16
1.9828936544589497
17
2.5770621420339586
18
3.068355022249613
19
2.768571682873503
20
3.046313968014893
21
2.8056288606471367
22
2.6170330729876268
23
2.9170522159706627
24
3.6217356604946835
25
2.651536582476009
26
2.5692015849162577
27
3.0600846012454532
28
1.2998582216634356
29
2.410582261419391
30
1.412450025221754
31
1.651836283485338
32
2.095873676119809
33
1.9269704776167327
34
1.3813699960621197
35
3.0368998019210793
36
1.412891030545142
37
1.358983454616329
38
1.8559697800404895
39
3.062740145724833
40
3.3959098252982223
41
2.2328278881793517
42
3.0423228429919456
43
3.1001369231399427
44
3.141688304964095
45
3.224793816981527
46
2.3491637579445444
47
2.

TypeError: can't multiply sequence by non-int of type 'float'