**Deep learning-based cryptocurrency
sentiment construction**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
df=pd.read_csv("../input/df.csv") 

In [None]:
df.head(10)

In [None]:
df.isnull().sum()

In [None]:
def check_missing_data(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame or bool
        ``False`` when no column contains a missing value. Otherwise a frame
        with one column per column of ``df`` and three rows: ``Total`` (number
        of missing cells), ``Percent`` (share of missing cells, 0-100) and
        ``Types`` (the column dtype as a string).
    """
    # written by MJ Bahmani
    total = df.isnull().sum()
    if not total.any():
        return False

    # Divide first, then scale: the previous `sum / (count * 100)` form
    # produced values 10,000 times too small.
    percent = total / len(df) * 100
    output = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    output['Types'] = df.dtypes.astype(str).values
    return np.transpose(output)

In [None]:
check_missing_data(df)

In [None]:
df.groupby('class').describe()

In [None]:
# Inspect the sentiment class balance.
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x='class', data=df)

In [None]:
df.info()

**Data Processing**

In [None]:
import nltk
from nltk.corpus import stopwords
import re
import string
def preprocess(s,remove_stopwords=False):
    """Normalise a raw tweet into a lower-cased, tagged, punctuation-free string.

    Steps, in order:
      1. lower-case the text;
      2. replace URLs with ``linktag``;
      3. drop retweet / attribution prefixes (``rt @user``, ``via @user``);
      4. replace remaining ``@mentions`` with ``usertag`` and ``$tickers``
         with ``moneytag``;
      5. replace whitespace-delimited negation words (not, none, no, neither,
         never, nobody) with ``negtag_``;
      6. drop ``&amp`` entities and stand-alone ``rt`` markers;
      7. optionally remove English stop words;
      8. strip all punctuation except ``!`` and ``?`` (this also strips the
         trailing underscore of ``negtag_``).

    Parameters
    ----------
    s : str
        Raw tweet text.
    remove_stopwords : bool, default False
        When True, NLTK English stop words are removed word by word.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # Convert words to lower case
    s = s.lower()

    # URLs first, so later patterns cannot match inside a link.
    s = re.sub(r"http\S+", "linktag", s) #linktag
    # Retweet/attribution prefixes must be removed while the @mentions are
    # still present; the text is already lower-cased, hence "rt" not "RT".
    s = re.sub(r"\b(?:rt|via)\b(?:\W*@\w+)+", " ", s)
    s = re.sub(r"@\S+", "usertag", s) #usertag
    # "$" must be escaped: unescaped it is the end-of-string anchor and the
    # pattern can never match.
    s = re.sub(r"\$\S+", "moneytag", s) #moneytag
    #s = re.sub("^\d+\s|\s\d+\s|\s\d+$", "numbertag", s) #numbertag
    # Look-arounds instead of literal spaces: also matches at the start/end of
    # the tweet and for two negations in a row.
    s = re.sub(r"(?<!\S)(?:not|none|no|neither|never|nobody)(?!\S)", "negtag_", s)
    s = re.sub(r"&amp", " ", s)
    s = re.sub(r"(?<!\S)rt(?!\S)", " ", s)

    # remove stop words (word by word, keeping the result a string)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        s = " ".join(w for w in s.split() if w not in stops)

    # remove punctuation except "!" and "?" from each word
    remove = string.punctuation
    remove = remove.replace("?", "") # don't remove Interrogation marks
    remove = remove.replace("!", "") # don't remove Exclamation marks
    pattern = "[{}]".format(re.escape(remove)) # escaped so "]", "\" and "-" stay literal
    s = re.sub(pattern, "", s)  # the result must be assigned; str is immutable

    return s
df['clean_tweet'] = df['tweets'].apply(preprocess)

In [None]:
# Print every tweet containing "?" or "!" (first column), next to the value
# held in the third column of the same row.
for i, (raw, third) in enumerate(zip(df.iloc[:, 0], df.iloc[:, 2])):
    has_mark = '?' in raw or '!' in raw
    if has_mark:
        print(i,' ',raw,'||',third)

In [None]:
from wordcloud import WordCloud
# Join the cleaned tweets directly: Series.to_string() would also render the
# index labels and column padding into the text handed to the word cloud.
text = " ".join(df['clean_tweet'].astype(str)).lower()
wordcloud = WordCloud(
    collocations=False,
    relative_scaling=0.5,
    stopwords=set(stopwords.words('english'))).generate(text)

In [None]:
# Join the cleaned tweets directly: Series.to_string() would also render the
# index labels and column padding into the text handed to the word cloud.
text = " ".join(df['clean_tweet'].astype(str)).lower()
wordcloud = WordCloud(
    collocations=False,
    relative_scaling=0.5,
    stopwords=set(stopwords.words('english'))).generate(text)

# Render the cloud without axes.
plt.figure(figsize=(12,12))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

Une première remarque : on voit que « linktag » est très présent. Il n'était donc peut-être pas judicieux d'uniformiser les liens, puisqu'ils n'ont pas forcément tous la même valeur.

**RNN algorithm setup**

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten,Reshape
from keras.layers import Conv1D, MaxPooling1D
from keras.utils import np_utils
from keras.layers import LSTM, LeakyReLU
from keras.callbacks import CSVLogger, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import h5py
import os
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense,Embedding,Conv1D,MaxPooling1D,LSTM
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

**RNN without pretraining**

In [None]:
# Encode the stringified labels as binary targets (positive -> 1, negative -> 0).
# Any other value is left untouched. A single vectorised replace avoids the
# chained assignment `df["class"][i] = ...`, which may write to a temporary
# copy and relies on the index being 0..n-1.
df["class"] = df["class"].replace({"['positive']": 1, "['negative']": 0})

In [None]:
# Drop unlabelled rows and renumber the index: later cells build boolean masks
# from df['class'] and apply them to freshly created frames/arrays, which only
# lines up when df is indexed 0..n-1.
df = df.dropna(subset=['class']).reset_index(drop=True)

In [None]:
# Seed NumPy's global RNG so runs are repeatable, then pull the cleaned text
# and the labels out of the frame as plain arrays.
seed = 201
np.random.seed(seed)
X = df["clean_tweet"].values
y = df["class"].values

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Cap the vocabulary at 20000 so every token id stays below the input_dim of
# the Embedding(20000, ...) layers used by the models; without num_words the
# ids grow with the corpus and can index past the embedding table.
tk = Tokenizer(lower = True, num_words=20000)
tk.fit_on_texts(X)
X_seq = tk.texts_to_sequences(X)
# Right-pad (or truncate) every tweet to 256 token ids.
X_pad = pad_sequences(X_seq, maxlen=256, padding='post')

In [None]:
X_pad=pd.DataFrame(X_pad)
X_pad.head(10)

In [None]:
X[0]

In [None]:
len(X_pad[0])

In [None]:
len(X[0])

In [None]:
X_pad.shape,df.shape,y.shape

In [None]:
# Positional boolean masks (plain arrays): X_pad and y are indexed by row
# position, so selection must not depend on df's index labels aligning.
is_negative = (df['class']==0).values
is_positive = (df['class']==1).values

X_n = X_pad[is_negative]
X_p = X_pad[is_positive]
y_p = y[is_positive]
y_n = y[is_negative]

In [None]:
X_p.shape,y_p.shape,X_n.shape,y_n.shape

In [None]:
# Train/test split done per class, so both sets keep balanced proportions of
# "positive" and "negative" examples.

X_p_train, X_p_test,y_p_train,y_p_test = train_test_split(X_p,y_p, test_size=0.3, random_state=seed)

X_n_train, X_n_test,y_n_train,y_n_test = train_test_split(X_n,y_n, test_size=0.3, random_state=seed)

X_p_train=pd.DataFrame(X_p_train)
X_n_train=pd.DataFrame(X_n_train)
X_p_test=pd.DataFrame(X_p_test)
X_n_test=pd.DataFrame(X_n_test)

# Labels here are only 0 or 1; cast away the object dtype so the frames are numeric.
y_p_train=pd.DataFrame(y_p_train).astype(int)
y_n_train=pd.DataFrame(y_n_train).astype(int)
y_p_test=pd.DataFrame(y_p_test).astype(int)
y_n_test=pd.DataFrame(y_n_test).astype(int)

# Features and labels must be stacked in the SAME class order (positive, then
# negative); stacking y as negative-then-positive pairs rows with the wrong label.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_train=pd.concat([X_p_train, X_n_train], ignore_index=True)
y_train=pd.concat([y_p_train, y_n_train], ignore_index=True)

X_test=pd.concat([X_p_test, X_n_test], ignore_index=True)
y_test=pd.concat([y_p_test, y_n_test], ignore_index=True)

# Shuffle the training rows with one shared permutation so that mini-batches
# and any head/tail slice contain both classes.
train_order = np.random.RandomState(seed).permutation(len(X_train))
X_train = X_train.iloc[train_order].reset_index(drop=True)
y_train = y_train.iloc[train_order].reset_index(drop=True)

In [None]:
batch_size = 128
# Hold out `batch_size` rows for validation with a stratified, shuffled split.
# X_train is built by stacking one class after the other, so simply taking the
# first `batch_size` rows would give a validation set made of a single class.
X_train1, X_valid, y_train1, y_valid = train_test_split(
    X_train, y_train,
    test_size=batch_size,
    stratify=y_train,
    random_state=seed,
)

> LSTM

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
# input_length must equal the width of the padded sequences passed to fit().
max_words = X_train1.shape[1]
embedding_size = 100
model = Sequential()
model.add(Embedding(20000, embedding_size, input_length=max_words))
model.add(LSTM(64))
# Sigmoid, not tanh: binary cross-entropy expects a probability in (0, 1),
# while tanh emits values in (-1, 1) and makes the loss undefined.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adadelta', metrics=['accuracy'])

In [None]:
model_history = model.fit(X_train1, y_train1, validation_data=(X_valid, y_valid),epochs=50, batch_size=batch_size, verbose=2)

In [None]:
scores=model.evaluate(X_test,y_test,verbose=0)

In [None]:
print(scores[1]) 

> GRU

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Dropout

# input_length must equal the width of the padded sequences passed to fit().
max_words = X_train1.shape[1]
embedding_size = 32
model = Sequential()
model.add(Embedding(20000, embedding_size, input_length=max_words))
model.add(GRU(128))
# Sigmoid, not tanh: binary cross-entropy expects a probability in (0, 1),
# while tanh emits values in (-1, 1) and makes the loss undefined.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adadelta', metrics=['accuracy'])
model_history = model.fit(X_train1, y_train1, validation_data=(X_valid, y_valid),epochs=50, batch_size=batch_size, verbose=2)

In [None]:
scores=model.evaluate(X_test,y_test,verbose=0)
print(scores[1]) 

In [None]:
?Sequential().add(Embedding())

In [None]:
# I also get an accuracy of 0.7931312665667858.
# Varying the vocabulary_size parameter shows that it strongly affects the final score.

**RNN pretrained with Word2Vec Skip-gram**

> Skip gram

In [None]:
from nltk.corpus import brown
from gensim.models import Word2Vec
import multiprocessing

In [None]:
from gensim.models import KeyedVectors
#model_ug_cbow = KeyedVectors.load('w2v_model_ug_cbow.word2vec')
#model_ug_sg = KeyedVectors.load('w2v_model_ug_sg.word2vec')

In [None]:
 from nltk import word_tokenize

In [None]:
# Build the tokenised sentences from which the Word2Vec vocabulary will be built.
from nltk import word_tokenize
from gensim.models.phrases import Phrases, Phraser
# One whitespace-split token list per cleaned tweet.
sent = [row.split() for row in df['clean_tweet']]
# NOTE(review): max_vocab_size = 50 looks far too small for a tweet corpus and
# presumably prunes almost every candidate phrase -- confirm this is intended.
phrases = Phrases(sent, max_vocab_size = 50, progress_per=10000)
bigram = Phraser(phrases)
# Apply the phrase model to the token lists.
sentences = bigram[sent]

In [None]:
sentences.corpus[0][-1]

In [None]:
?Phrases()

In [None]:
from collections import defaultdict  
# Count how often each token (or detected phrase) occurs across all sentences.
# The loop variables are named `tokens`/`token` so the loop no longer
# overwrites `sent`, the tokenised corpus defined in an earlier cell.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
len(word_freq)

In [None]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

Another method

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils

from sklearn.linear_model import LogisticRegression

# Skip-gram (sg=1) Word2Vec with 256-dimensional vectors and a 10-token window.
# `cores` was computed but never used; pass it as the number of worker threads.
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(window=10,
                     size = 256,
                     sg=1,
                     workers=cores)
w2v_model.build_vocab(sentences)

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=25, report_delay=1)

#for epoch in range(25):
    #w2v_model.train(utils.shuffle([x for x in tqdm(X)]), total_examples=len(X), epochs=1)
    #w2v_model.alpha -= 0.002
    #w2v_model.min_alpha = model_ug_dbow.alpha

In [None]:
def get_vectors(model, corpus, size):
    """Build one document vector per text by summing its word vectors.

    Parameters
    ----------
    model : mapping-like
        Anything supporting ``model[word]`` and raising ``KeyError`` for
        unknown words. If the object exposes a ``wv`` attribute, that
        attribute is used for the lookups instead.
    corpus : pandas.DataFrame, pandas.Series, list or array
        The texts. For a DataFrame the texts are read from column ``0``.
    size : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), size)``; row ``k`` is the sum of the
        vectors of the known words of the k-th text (all zeros if none is known).
    """
    lookup = getattr(model, "wv", model)
    texts = corpus[0] if isinstance(corpus, pd.DataFrame) else pd.Series(corpus)

    vecs = np.zeros((len(texts), size))
    # Write rows by position, not by index label, so a corpus whose index is
    # not 0..n-1 (e.g. after filtering) cannot write out of bounds.
    for row, text in enumerate(texts):
        for word in str(text).split():
            try:
                vecs[row] += lookup[word]
            except KeyError:
                # Out-of-vocabulary word: contributes nothing.
                continue

    return vecs
  


#train_vecs_dbow = get_vectors(w2v_model.train., X, 256)
#validation_vecs_dbow = get_vectors(w2v_model, x_validation, 100)

#clf = LogisticRegression()
#clf.fit(train_vecs_dbow, y_train)
#clf.score(validation_vecs_dbow, y_validation)

In [None]:
sentences.corpus[1]

In [None]:
X=pd.DataFrame(X) # numpy objects have no split() attribute 

In [None]:
X_sg = get_vectors(w2v_model, X, 256)

In [None]:
X_sg=pd.DataFrame(X_sg)

In [None]:
from sklearn import preprocessing
scale = preprocessing.normalize
X_sg=scale(X_sg)

In [None]:
X_sg=pd.DataFrame(X_sg)+1
X_sg.head()

In [None]:
# Work on a copy: plain assignment would alias X_sg, so adding the label
# column would silently add it to the feature matrix as well.
new_df=X_sg.copy()
new_df["class"]=y

In [None]:
X_sg_n = new_df[new_df['class']==0]
X_sg_p = new_df[new_df['class']==1]

In [None]:
X_sg_n = X_sg_n.iloc[:,0:256]
X_sg_p = X_sg_p.iloc[:,0:256]

In [None]:
X_sg_n.shape

In [None]:
y_p = y[df['class']==1]
y_n = y[df['class']==0]

In [None]:
# Train/test split done per class, so both sets keep balanced proportions of
# positive and negative examples.
X_p_train, X_p_test,y_p_train,y_p_test = train_test_split(X_sg_p, y_p, test_size=0.2, random_state=seed)

X_n_train, X_n_test,y_n_train,y_n_test = train_test_split(X_sg_n, y_n, test_size=0.2, random_state=seed)

X_p_train=pd.DataFrame(X_p_train)
X_n_train=pd.DataFrame(X_n_train)
X_p_test=pd.DataFrame(X_p_test)
X_n_test=pd.DataFrame(X_n_test)

# Labels here are only 0 or 1; cast away the object dtype so the frames are numeric.
y_p_train=pd.DataFrame(y_p_train).astype(int)
y_n_train=pd.DataFrame(y_n_train).astype(int)
y_p_test=pd.DataFrame(y_p_test).astype(int)
y_n_test=pd.DataFrame(y_n_test).astype(int)

# Features and labels must be stacked in the SAME class order (positive, then
# negative); stacking y as negative-then-positive pairs rows with the wrong label.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_train=pd.concat([X_p_train, X_n_train], ignore_index=True)
y_train=pd.concat([y_p_train, y_n_train], ignore_index=True)

X_test=pd.concat([X_p_test, X_n_test], ignore_index=True)
y_test=pd.concat([y_p_test, y_n_test], ignore_index=True)

# Shuffle the training rows with one shared permutation so every mini-batch
# contains both classes.
train_order = np.random.RandomState(seed).permutation(len(X_train))
X_train = X_train.iloc[train_order].reset_index(drop=True)
y_train = y_train.iloc[train_order].reset_index(drop=True)


#batch_size = 128
#X_train1, X_valid, y_train1, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)




In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

> LSTM

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import np_utils
import tensorflow as tf

In [None]:
len(sentences)

In [None]:
X_train.shape

In [None]:
w2v_model.vector_size

In [None]:
X_train.min()

In [None]:
embed_dim = 256
lstm_out = 64
batch_size = 64

# NOTE(review): X_train holds real-valued document vectors, yet the first layer
# is an Embedding, which interprets its input as integer token ids. Presumably
# a Dense input (or an Embedding initialised from the Word2Vec weights and fed
# token ids) was intended -- confirm before trusting the reported accuracy.
model = Sequential()
model.add(Embedding(2000, 64,  input_length = X_train.shape[1] ))
model.add(LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.5))
# Sigmoid, not tanh: binary cross-entropy expects a probability in (0, 1),
# while tanh emits values in (-1, 1) and makes the loss undefined.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adadelta', metrics=['accuracy'])
model_history = model.fit(X_train, y_train, epochs=50, batch_size=batch_size, verbose=2)


In [None]:
scores=model.evaluate(X_test, y_test)
print(scores[1]) 

> GRU

In [None]:
# `epochs` was never defined in the notebook; use the same value as the other
# training cells so this cell runs on a fresh kernel.
epochs = 50

# NOTE(review): X_train1 / X_valid are the token-id splits created in the
# "RNN without pretraining" section, not the skip-gram features built above --
# confirm this is the intended training data for this section.
model = Sequential()
# input_length must equal the width of the sequences passed to fit().
model.add(Embedding(20000, embedding_size, input_length=X_train1.shape[1]))
model.add(keras.layers.GRU(128))
# Sigmoid, not tanh: binary cross-entropy expects a probability in (0, 1).
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adadelta', metrics=['accuracy'])
model_history = model.fit(X_train1, y_train1, validation_data=(X_valid, y_valid),epochs=epochs, batch_size=batch_size, verbose=2)

In [None]:
# Evaluate the model and print the second entry of the returned scores.
# NOTE(review): `validation_vecs_cbowsg_sum` and `y_validation` are not defined
# in any visible cell; unless they come from elsewhere this raises NameError on
# a fresh kernel -- confirm which held-out set should be used here.
scores=model.evaluate(validation_vecs_cbowsg_sum, y_validation,verbose=0)
print(scores[1]) 

Malheureusement, faute de temps, nous n'avons pas pu générer les rendements associés aux sentiments,
ni mettre en place les régressions prédictives sur les séries temporelles des rendements logarithmiques de l'indice de crypto-monnaies (moyenne autorégressive et variance).