In [1]:
from scipy import sparse
import time
import nltk
from collections import Counter
import pandas as pd

import csv
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize,blankline_tokenize
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import re
import string
from collections import Counter
import json
import re as regex

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV


In [2]:
class TwitterData_Initialize():
    data = []
    processed_data = []
    wordlist = []

    featureList = []
    fea_vect=[]
    
    data_model = None
    data_labels = None
    is_testing = False
    
    def initialize(self, csv_file, is_testing_set=False, from_cached=None):
        if from_cached is not None:
            #self.data_model = pd.read_csv(from_cached)
            self.data_model = pd.read_json(from_cached)

            return

        self.is_testing = is_testing_set

        if not is_testing_set:
            #self.data = pd.read_csv(csv_file, header=0, names=["id", "emotion", "text"])
            self.data = pd.read_json(csv_file)

            #self.data = self.data[self.data["emotion"].isin(["positive", "negative", "neutral"])]
        

        self.processed_data = self.data
        self.wordlist = []
        self.data_model = None
        self.data_labels = None
        
    
    
    
    def do_process(self):
        start_time = time.time()
        def stem_and_join(row,stemmer=nltk.PorterStemmer()):
            row["spans"] = list(map(lambda str: stemmer.stem(str.lower()), row["spans"]))
            return row
    
        def tokenize_grams(row):
                
                # turn a doc into clean tokens
                def clean_doc(doc):
                    # split into tokens by white space
                    tokens = doc.split()
                    # remove punctuation from each token
                    table = str.maketrans('', '', string.punctuation)
                    tokens = [w.translate(table) for w in tokens]
                    # remove remaining tokens that are not alphabetic
                    tokens = [word for word in tokens if word.isalpha()]
                    # filter out stop words
                    #stop_words = set(stopwords.words('english'))
                    
                    #tokens = [w for w in tokens if not w in stop_words]
                    
                    
                    # filter out short tokens
                    tokens = [word.lower() for word in tokens if len(word) > 1]
                    return tokens
                




               
                # Function to apply lemmatization to a list of words
                def words_lemmatizer(words, encoding="utf8"):
                    wordnet_lemmatizer = WordNetLemmatizer()

                    lemma_words = []
                    wl = WordNetLemmatizer()
                    for word in words:
                        pos = find_pos(word)
                        lemma_words.append(wl.lemmatize(word, pos))
                    return lemma_words
                
                # Function to find part of speech tag for a word
                def find_pos(word):
                    # Part of Speech constants
                    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
   
                    pos = nltk.pos_tag(nltk.word_tokenize(word))[0][1]
                    # Adjective tags - 'JJ', 'JJR', 'JJS'
                    if pos.lower()[0] == 'j':
                        return 'a'
                    # Adverb tags - 'RB', 'RBR', 'RBS'
                    elif pos.lower()[0] == 'r':
                        return 'r'
                    # Verb tags - 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
                    elif pos.lower()[0] == 'v':
                        return 'v'
                    # Noun tags - 'NN', 'NNS', 'NNP', 'NNPS'
                    else:
                        return 'n'
    
            
                #convert to string
                idx=row["spans"]
                #ch="".join(x for x in idx if x)
                ch=' '.join(idx)
               
    
                
                
                n_grams=clean_doc(ch)
                tokens=words_lemmatizer(n_grams)

                #print("nombre of tokens",len(n_grams))

                row["token_spans"] = tokens
    
                return row
        #self.processed_data = self.processed_data.apply(stem_and_join, axis=1)
        self.processed_data = self.processed_data.apply(tokenize_grams, axis=1)
        print("--- %s seconds ---" % (time.time() - start_time))

    

    def build_wordlist(self, min_occurrences=0, max_occurences=500, stopwords=nltk.corpus.stopwords.words("english"),
                       ):
       
        self.wordlist = []
        whitelist=["to","on","for","up","below","short","long"]
        
        #stopwords=[]
       
        words = Counter()
      
        for idx in self.processed_data.index:
            words.update(self.processed_data.loc[idx, "token_spans"])

        for idx, stop_word in enumerate(stopwords):
            if stop_word not in whitelist:
                del words[stop_word]
        
        
        print(words.most_common())        

        word_df = pd.DataFrame(data={"word": [k for k, v in words.most_common() if min_occurrences < v < max_occurences],
                                     "occurrences": [v for k, v in words.most_common() if min_occurrences < v < max_occurences]},
                               columns=["word", "occurrences"])

        word_df.to_csv("wordlist_copie.csv", index_label="idx")
        self.wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences]
        
        
        
    def build_data_model(self):
        label_column = []
        Id_column=["ID"]
        label_column = ["label"]

        columns = Id_column + label_column + list(
            map(lambda w: w ,self.wordlist))
        labels = []
        rows = []
        for idx in self.processed_data.index:
            current_row = []

            if True:
                # add label
                current_label = self.processed_data.loc[idx, "sentiment score"]
                current_id = self.processed_data.loc[idx, "id"]
                
                labels.append(current_id)
                labels.append(current_label)
                
                current_row.append(current_id)
                current_row.append(current_label)

            # add bag-of-words
            tokens = set(self.processed_data.loc[idx, "token_spans"])
            for _, word in enumerate(self.wordlist):
                current_row.append(1 if word in tokens else 0)

            rows.append(current_row)

        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        return self.data_model, self.data_labels


In [3]:
class TwitterData_Initialize_test(TwitterData_Initialize):

    
    
    def do_process(self):
        def stem_and_join(row,stemmer=nltk.PorterStemmer()):
            
            row["spans"] = list(map(lambda str: stemmer.stem(str.lower()), row["spans"]))
            return row
    
        def tokenize_grams(row):
                
                # Function to remove stop words
                def remove_stopwords(text, lang='english'):
                    whitelist = ["n't","not","below"]    

                    stop_words = set(stopwords.words('english'))
                    word_tokens = word_tokenize(text)
                    #filtered_sentence = [w for w in word_tokens if ((not w in stop_words) or (w in whitelist))]
                    filtered_sentence = []
                    for w in word_tokens:
                        if ((w not in stop_words) or (w in whitelist)):
                            filtered_sentence.append(w)
                
                    ch=" ".join(filtered_sentence)

                    return ch
#40

                def clean_doc(doc):
                    # split into tokens by white space
                    tokens = doc.split()
                    # remove punctuation from each token
                    table = str.maketrans('', '', string.punctuation)
                    tokens = [w.translate(table) for w in tokens]
                    # remove remaining tokens that are not alphabetic
                    tokens = [word for word in tokens if word.isalpha()]
                    # filter out stop words
                    #stop_words = set(stopwords.words('english'))
                    #tokens = [w for w in tokens if not w in stop_words]
                    
                    # filter out short tokens
                    tokens = [word.lower() for word in tokens if len(word) > 1]
                    return tokens


                

                # Function to apply lemmatization to a list of words
                def words_lemmatizer(words, encoding="utf8"):
                    wordnet_lemmatizer = WordNetLemmatizer()

                    lemma_words = []
                    wl = WordNetLemmatizer()
                    for word in words:
                        pos = find_pos(word)
                        lemma_words.append(wl.lemmatize(word, pos))
                    return lemma_words

                # Function to find part of speech tag for a word
                def find_pos(word):
                    # Part of Speech constants
                    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
   
                    pos = nltk.pos_tag(nltk.word_tokenize(word))[0][1]
                    # Adjective tags - 'JJ', 'JJR', 'JJS'
                    if pos.lower()[0] == 'j':
                        return 'a'
                    # Adverb tags - 'RB', 'RBR', 'RBS'
                    elif pos.lower()[0] == 'r':
                        return 'r'
                    # Verb tags - 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
                    elif pos.lower()[0] == 'v':
                        return 'v'
                    # Noun tags - 'NN', 'NNS', 'NNP', 'NNPS'
                    else:
                        return 'n'
    
            
                token=[]
                idx=row["spans"]
                ch=' '.join(idx)


    
                #words_stemmer(ch, type="PorterStemmer", lang="english", encoding="utf8")
                #Convert to lower case
                n_grams=clean_doc(ch)
                tokens=words_lemmatizer(n_grams)

                #print("list ngrams--------------\n",n_grams)
                row["token_spans"] = tokens
    
                return row
        #self.processed_data = self.processed_data.apply(stem_and_join, axis=1)
        self.processed_data = self.processed_data.apply(tokenize_grams, axis=1)
        
    def build_wordlist(self, min_occurrences=0, max_occurences=500, stopwords=nltk.corpus.stopwords.words("english"),
                      ):
        
        whitelist=["to","on","for","up","below","short","long"]
        self.wordlist = []
        #stopwords=[]
        words = Counter()
      
        for idx in self.processed_data.index:
            words.update(self.processed_data.loc[idx, "token_spans"])

        for idx, stop_word in enumerate(stopwords):
            if stop_word not in whitelist:
                del words[stop_word]
        
        
        print(words.most_common())        

        word_df = pd.DataFrame(data={"word": [k for k, v in words.most_common() if min_occurrences < v < max_occurences],
                                     "occurrences": [v for k, v in words.most_common() if min_occurrences < v < max_occurences]},
                               columns=["word", "occurrences"])

        word_df.to_csv("wordlist_test.csv", index_label="idx")
        self.wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences]
        
        
    def build_data_model(self):
        
        label_id = ["ID"]

        columns = label_id + list(
            map(lambda w: w ,self.wordlist))
        labels = []
        rows = []
        for idx in self.processed_data.index:
            current_row = []
            if True:
                # add label
                current_id = self.processed_data.loc[idx, "id"]
                
                labels.append(current_id)
                
                current_row.append(current_id)

            

            # add bag-of-words
            tokens = set(self.processed_data.loc[idx, "token_spans"])
            for _, word in enumerate(self.wordlist):
                current_row.append(1 if word in tokens else 0)

            rows.append(current_row)

        self.data_model = pd.DataFrame(rows, columns=columns)
        return self.data_model


In [4]:
data = TwitterData_Initialize()
data.initialize("Microblog_Trainingdata.json")

data.processed_data.head(20)

Unnamed: 0,cashtag,id,sentiment score,source,spans
0,$FB,719659409228451840,0.366,twitter,[watching for bounce tomorrow]
1,$LUV,719904304207962112,0.638,twitter,[record number of passengers served in 2015]
2,$NFLX,5329774,-0.494,stocktwits,[out $NFLX -.35]
3,$DIA,719891468173844480,0.46,twitter,"[Looking for a strong bounce, Lunchtime rally ..."
4,$PLUG,20091246,0.403,stocktwits,[Very intrigued with the technology and growth...
5,$GMCR,5819749,0.0,stocktwits,"[short worked, puts up]"
6,$IBM,709741154393133056,-0.296,twitter,[overbought]
7,$JOSB,17892972,-0.546,stocktwits,"[absolute garbage still up, stores TOTALLY EMP..."
8,$CSTM,709834259687710720,-0.438,twitter,[Biggest Market Losers]
9,$PYPL,708481442079068160,0.408,twitter,[Love this company long time.]


In [5]:
data.do_process()
data.processed_data.head()
data.processed_data

--- 323.9797320365906 seconds ---


Unnamed: 0,cashtag,id,sentiment score,source,spans,token_spans
0,$FB,719659409228451840,0.366,twitter,[watching for bounce tomorrow],"[watch, for, bounce, tomorrow]"
1,$LUV,719904304207962112,0.638,twitter,[record number of passengers served in 2015],"[record, number, of, passenger, serve, in]"
2,$NFLX,5329774,-0.494,stocktwits,[out $NFLX -.35],"[out, nflx]"
3,$DIA,719891468173844480,0.460,twitter,"[Looking for a strong bounce, Lunchtime rally ...","[look, for, strong, bounce, lunchtime, rally, ..."
4,$PLUG,20091246,0.403,stocktwits,[Very intrigued with the technology and growth...,"[very, intrigue, with, the, technology, and, g..."
5,$GMCR,5819749,0.000,stocktwits,"[short worked, puts up]","[short, work, put, up]"
6,$IBM,709741154393133056,-0.296,twitter,[overbought],[overbought]
7,$JOSB,17892972,-0.546,stocktwits,"[absolute garbage still up, stores TOTALLY EMP...","[absolute, garbage, still, up, store, totally,..."
8,$CSTM,709834259687710720,-0.438,twitter,[Biggest Market Losers],"[big, market, loser]"
9,$PYPL,708481442079068160,0.408,twitter,[Love this company long time.],"[love, this, company, long, time]"


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def get_tweets(data):
    start_time = time.time()

    tweets=list(data.processed_data.token_spans)
    
    list_tweet=[]
    for i,chaine in enumerate(tweets):
        ch=" ".join(chaine)
        list_tweet.append(ch)
    print(type(list_tweet))
    print("--- %s seconds ---" % (time.time() - start_time))

    return list_tweet

def get_count_vector(corpus):
    vect = CountVectorizer()#appel à l'objet counvectorizer
    vect.fit(corpus)#construction du vecteur de mots
    #print("Vocabulary size: {}".format(len(vect.vocabulary_)))
    #print("Vocabulary content:\n {}".format(vect.vocabulary_))
    vect.get_stop_words()
    X = vect.transform(corpus)
    #print("bag_of_words: {}".format(repr(X)))
    #print("Dense representation of bag_of_words:\n{}".format(
    #X.toarray()))
    return vect,X
   
    
#sparse_matrix = sparse.csr_matrix(bag_of_words)
#print("\nSciPy sparse CSR matrix:\n{}".format(sparse_matrix))

In [8]:
cols = ['sentiment score']  + [col for col in data.processed_data if col != 'sentiment score']
data.processed_data=data.processed_data[cols]
data.processed_data.iloc[:,0:]
data.processed_data.iloc[:,0]

0       0.366
1       0.638
2      -0.494
3       0.460
4       0.403
5       0.000
6      -0.296
7      -0.546
8      -0.438
9       0.408
10     -0.398
11     -0.349
12      0.025
13      0.486
14      0.308
15     -0.372
16      0.461
17      0.408
18     -0.699
19      0.495
20      0.306
21     -0.385
22      0.336
23      0.279
24      0.591
25     -0.351
26     -0.514
27     -0.519
28      0.361
29     -0.042
        ...  
1670   -0.385
1671    0.281
1672   -0.720
1673   -0.351
1674    0.365
1675   -0.382
1676   -0.186
1677   -0.409
1678    0.412
1679    0.465
1680    0.362
1681    0.220
1682    0.365
1683   -0.248
1684   -0.152
1685    0.435
1686    0.414
1687    0.471
1688   -0.589
1689   -0.513
1690   -0.581
1691    0.087
1692    0.230
1693    0.813
1694    0.380
1695   -0.126
1696    0.295
1697    0.405
1698    0.296
1699   -0.296
Name: sentiment score, Length: 1700, dtype: float64

In [9]:
"""train_data, test_data, y_train, y_test = train_test_split(data.processed_data.iloc[:,1:], data.processed_data.iloc[:,0],
                                                   train_size=0.7)
corpus=get_tweets(train_data)
len(corpus)"""
corpus=get_tweets(data)
with open("tweets.txt","w") as fichier:
    fichier.writelines("\n".join(corpus))

<class 'list'>
--- 0.0009953975677490234 seconds ---


In [27]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) -> 
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    print(len(vec.get_feature_names()))
    print(bag_of_words[0])

    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]



common_words = get_top_n_words(corpus, 20)
for word, freq in common_words:
    print(word, freq)

1921
  (0, 203)	1
  (0, 653)	1
  (0, 1718)	1
  (0, 1832)	1
be 289
the 249
to 235
in 151
on 133
for 128
stock 126
of 110
up 109
short 95
look 83
and 80
it 76
buy 76
long 74
this 73
today 68
sell 66
high 66
call 59


In [81]:
corpus=get_tweets(data)
lst=get_top_n_words(corpus)
with open("top_words.csv", "w",newline='') as fichier:
    c = csv.writer(fichier,delimiter=",")

    for l in lst:
        c.writerow(l)
    

<class 'list'>
--- 0.0009984970092773438 seconds ---
set()


In [82]:
data_test = TwitterData_Initialize_test()
data_test.initialize("Microblog_Trialdata.json")

data_test.processed_data.head()
data_test.do_process()

In [93]:


train_vec,X=get_count_vector(corpus)
labels=data.processed_data["sentiment score"].values


In [96]:
words=list(train_vec.get_feature_names())
X_train=pd.DataFrame(X.toarray(),columns=words)
y_train=pd.DataFrame(labels,columns=['label'])


In [97]:
import xgboost as xgb
from sklearn import model_selection, preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
#---------------------------XGBregressor---------------------------------------------------

#Fitting XGB regressor with parameters obtained by Grid searchCV
"""
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
param_test2 = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
param_test3 = {
    'gamma':[i/10.0 for i in range(0,5)],
    'n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 1400, num = 7)]
}

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
model = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, max_depth=4,
 min_child_weight=4,  subsample=0.8, colsample_bytree=1.0, nthread=4), 
 param_grid = param_test3,scoring='neg_mean_squared_error',cv=5)
model.fit(X_train,y_train)

"""
#model = xgb.XGBRegressor(max_depth=4, min_child_weight= 4,gamma=0.3,subsample=0.8,colsample_bytree=1.0,n_estimators=1000)
XGBR=xgb.XGBRegressor(learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 nthread=4,
 scale_pos_weight=1,
 seed=27)
XGBR.fit(X_train,y_train)

print (XGBR)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=4, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8)


In [98]:
test_corpus=get_tweets(data_test)
test_corpus

<class 'list'>
--- 0.0 seconds ---


['put on little short',
 'short some',
 'buying opportunity',
 'scale up on long position',
 'it time to sell bank',
 'enter long',
 'picked some up',
 'time to accumulate for long position far more upside than downside',
 'look for strong bounce lunchtime rally come',
 'very intrigue with the technology and growth potential',
 'work put up',
 'big market loser',
 'goog googl would suck',
 'buying sbux on dip',
 'be short below and be overbought',
 'dont put on down little short']

In [99]:
common_words = get_top_n_words(test_corpus, 20)
for word, freq in common_words:
    print(word, freq)
print(len(test_corpus))

set()
on 4
short 4
put 3
up 3
long 3
little 2
some 2
buying 2
position 2
time 2
to 2
for 2
and 2
be 2
opportunity 1
scale 1
it 1
sell 1
bank 1
enter 1
16


In [116]:
test_vec,X_=get_count_vector(test_corpus)
labels_=data_test.processed_data["sentiment score"].values
test_words=list(test_vec.get_feature_names())
X_test=pd.DataFrame(X_.toarray(),columns=test_words)
y_test=pd.DataFrame(labels_,columns=['label'])

In [117]:
bow_columns=list(X_train.columns)
for word in bow_columns:
    current_row=[]
    if (not word  in test_words):
        # add label
        for idx in X_test.index:
            current_row.append(0)
        X_test[word]=current_row
X_test=X_test[X_train.columns]

In [118]:
#Predict 
#output=grid.best_estimator_.predict(X_test)
tweet_row=list(data_test.processed_data["spans"])
#old_pred=list(data_test.processed_data["sentiment score"])
old_pred=list(y_test.values)
print(len(old_pred))
output = XGBR.predict(X_test)
#final_df = pd.DataFrame(tweet_row, columns=labels)
final_df = pd.DataFrame()
final_df["tweet"] = tweet_row

final_df["old_pred"] = old_pred

final_df["Prediction"] = output
final_df.to_csv("Output_1.csv",sep=",")
final_df.head(20)

16


Unnamed: 0,tweet,old_pred,Prediction
0,[Putting on a little $F short],[-0.454],-0.774885
1,[short some],[-0.464],-0.335771
2,[buying opportunity],[0.445],0.415726
3,[Scaling Up on Long Position],[0.661],0.692813
4,[its time to sell banks],[-0.763],-0.521347
5,[Entering long],[0.627],0.470386
6,[picked some up],[0.653],0.397556
7,"[time to accumulate for a long position, far m...",[0.668],0.617554
8,"[Looking for a strong bounce, Lunchtime rally ...",[0.46],0.493107
9,[Very intrigued with the technology and growth...,[0.403],0.393143


In [119]:
#test model accuracy
from sklearn.metrics import r2_score
from sklearn.metrics importean_squared_error

ss
print("mean squared error:" ,mean_squared_error(output, y_test))
print("R2 score:" ,r2_score(output,y_test))

mean squared error: 0.03683362613575526
R2 score: 0.8713257051094979
