## Import the Pickle

In [1]:
import pickle
from langdetect import detect
import re
import string
#from project_functions import *
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import sentiment_mod as s

In [2]:
!ls /home/jovyan/work/2_Semester

'API Tweeter.ipynb'   featuresets.pickle	      Text_Mining_MASTER
 backup		     'Machine Learning'		      tweets_df.csv
'Deep Learning'      'PLaying with tutorials.ipynb'   tweets_jan2019


In [3]:
pickle_tweets = open("/home/jovyan/work/2_Semester/tweets_jan2019", "rb")
example_tweets = pickle.load(pickle_tweets)
example_tweets = example_tweets[["datetime","text"]]
example_tweets.head()

Unnamed: 0,datetime,text
0,2019-01-01 00:00:00,"@ #1, Bitcoin with unit price of $3,742.7, m..."
1,2019-01-01 00:00:00,Learn it. 2018 Sees Bitcoin ’s Lowest Average...
2,2019-01-01 00:00:01,"仮想通貨の時価総額 $125,622,003,150 BTC 価格:$3734.04..."
3,2019-01-01 00:00:01,IAM Platform Curated Retweet: Via: https:// ...
4,2019-01-01 00:00:01,"Bitcoin - BTC Price: $3,742.70 Change in 1h: ..."


### Take a sample

In [4]:
example_tweets = example_tweets.head(100)

### Filter on English Tweets

In [5]:
def filter_english(dataframe):
    blanco = "blanco"
    text_column = dataframe["text"]
    
    # Create a list saving all the languages of the tweets
    language_list =[]

    for i in text_column:

        try:
            language = detect(i)
            language_list.append(language)
        except:
            language_list.append(blanco)    
    
    dataframe["Language"] = language_list
    
    return dataframe.loc[dataframe['Language'] == "en"]


In [6]:
example_tweets = filter_english(example_tweets)

### Clean Tweets

In [7]:
def removal_function(dataframe):
    new_text = []
    text_column = dataframe["text"]
    for i in text_column:
        y = i

        y = re.sub(r"@[A-Z-a-z-0-9_.]+","", y) #remove users with@
        y = y.replace("\n"," ") # remove enters
        y= re.sub(r"http\S+","",y) # removes links
        y= re.sub("\s+"," ",y)  #removes more one spaces
        y= re.sub(r"&(amp;)", "&", y) # removes and in html format
        y = re.sub(r"[0-9]","",y) #remove numbers
        y=re.sub(r"(.+?)\1+",r"\1",y) #remove repeted letters
        y= re.sub("\s+"," ",y) #remove more one space

        i = y
        new_text.append(i)
        
    dataframe["text"] = new_text
    return dataframe

In [8]:
example_tweets = removal_function(example_tweets)

### Tokenize, Remove Stopwords, Lemmatize, Stemmatize

In [9]:
def tokenize_dataframe(dataframe):
    text_column = dataframe["text"]
    new_text = []
    
    for i in text_column:
        i = i.lower()
        i = RegexpTokenizer(r'\w+').tokenize(i)
        new_text.append(i)
        
    text_column = new_text
    dataframe["text"] = text_column
    
    return dataframe

In [10]:
example_tweets = tokenize_dataframe(example_tweets)

In [11]:
def remove_stopwords_dataframe(dataframe):
    text_column = dataframe["text"]
    new_words = []
    
    for i in text_column:
        stop_words = set(stopwords.words("english"))
        stop_text = [j for j in i if not j in stop_words]
        new_words.append(stop_text)
    
    text_column = new_words
    dataframe["text"] = text_column
    
    return dataframe

In [12]:
example_tweets = remove_stopwords_dataframe(example_tweets)

In [13]:
def lemmatize_dataframe(dataframe):
    wordnet = WordNetLemmatizer()
    text_column = dataframe["text"]
    new_words = []
    
    for i in text_column:
        lemma = [wordnet.lemmatize(token) for token in i]
        new_words.append(lemma)
        
    text_column = new_words
    dataframe["text"] = text_column
    
    return dataframe
        

In [14]:
example_tweets = lemmatize_dataframe(example_tweets)

In [15]:
def stemmatize_dataframe(dataframe):
    stemmer = nltk.SnowballStemmer("english")
    text_column = dataframe["text"]
    new_words= []
    
    for i in text_column:
        stemmed = [stemmer.stem(token) for token in i]
        new_words.append(stemmed)
    
    text_column = new_words
    dataframe["text"] = text_column
    
    return dataframe

In [16]:
example_tweets = stemmatize_dataframe(example_tweets)

In [17]:
def untokenize_dataframe(dataframe):
    text_column = dataframe["text"]
    new_text = []
    
    for i in text_column:
        i = " ".join(i)
        new_text.append(i)
        
    text_column = new_text
    dataframe["text"] = text_column
    
    return dataframe

In [18]:
example_tweets = untokenize_dataframe(example_tweets)

### Sentiment Using TextBlob

In [19]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [20]:
def sentiment_textblob(text):
    analyze = TextBlob(text)

    if analyze.polarity == 0.0:
        sentiment_value = "neutral"
        return sentiment_value, analyze.polarity
    elif analyze.polarity < 0.0:
        sentiment_value = "negative"
        return sentiment_value, analyze.polarity
    elif analyze.polarity > 0.0:
        sentiment_value = "positive"
        return sentiment_value, analyze.polarity

In [21]:
example_tweets.shape

(94, 3)

In [22]:
def add_sentiment_textblob(dataframe):
    text_column = dataframe["text"]
    sentiment_textblob_list = []

    for i in text_column:
        sentiment_value, polarity = s.sentiment_textblob(i)#sentiment_textblob(i)
        sentiment_textblob_list.append(sentiment_value)

    dataframe["sentiment_textblob"] = sentiment_textblob_list
    
    return dataframe

In [23]:
example_tweets = add_sentiment_textblob(example_tweets)

### Sentiment Using NLTK

In [24]:
def add_sentiment_nltk(dataframe):
    text_column = dataframe["text"]
    sentiment_nltk_list = []

    for i in text_column:
        sentiment_value, polarity = s.sentiment_nltk(i)#sentiment_textblob(i)
        sentiment_nltk_list.append(sentiment_value)

    dataframe["sentiment_nltk"] = sentiment_nltk_list
    
    return dataframe

In [25]:
example_tweets = add_sentiment_nltk(example_tweets)

### Sentiment own trained models

In [26]:
def add_sentiment_own(dataframe):
    text_column = dataframe["text"]
    sentiment_own_list = []

    for i in text_column:
        sentiment_value, polarity = s.sentiment(i)#sentiment_textblob(i)
        if polarity < 0.75:
            sentiment_own_list.append("neutral")
        else:
            sentiment_own_list.append(sentiment_value)

    dataframe["sentiment_own_classifiers"] = sentiment_own_list
    
    return dataframe

In [27]:
example_tweets = add_sentiment_own(example_tweets)

### Graph sentiment & BTC time series

In [30]:
import copy
example_tweets_graph = copy.deepcopy(example_tweets)

In [31]:
example_tweets_graph.head()

Unnamed: 0,datetime,text,Language,sentiment_textblob,sentiment_nltk,sentiment_own_classifiers
0,2019-01-01 00:00:00,bitcoin unit price market cap hr vol,en,neutral,neutral,neg
1,2019-01-01 00:00:00,learn s bitcoin lowest averag daili price chan...,en,neutral,negative,neutral
2,2019-01-01 00:00:01,仮想通貨の時価総額 btc 価格 ドミナンス eth 価格 ドミナンス ixt 位 価格 ド...,en,neutral,neutral,neg
3,2019-01-01 00:00:01,iam platform curat retwet via twiter com _alte...,en,positive,neutral,neg
4,2019-01-01 00:00:01,bitcoin btc price chang h market cap rank bitc...,en,negative,neutral,neg


In [32]:
import pandas as pd
example_tweets_graph["datetime"] = pd.to_datetime(example_tweets_graph["datetime"])
example_tweets_graph = example_tweets_graph.set_index("datetime")
example_tweets_graph["hour"] = example_tweets_graph.index.hour

In [33]:
example_tweets_graph.head()

Unnamed: 0_level_0,text,Language,sentiment_textblob,sentiment_nltk,sentiment_own_classifiers,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-01 00:00:00,bitcoin unit price market cap hr vol,en,neutral,neutral,neg,0
2019-01-01 00:00:00,learn s bitcoin lowest averag daili price chan...,en,neutral,negative,neutral,0
2019-01-01 00:00:01,仮想通貨の時価総額 btc 価格 ドミナンス eth 価格 ドミナンス ixt 位 価格 ド...,en,neutral,neutral,neg,0
2019-01-01 00:00:01,iam platform curat retwet via twiter com _alte...,en,positive,neutral,neg,0
2019-01-01 00:00:01,bitcoin btc price chang h market cap rank bitc...,en,negative,neutral,neg,0


In [34]:
pnn_counts_textblob = example_tweets_graph.groupby(["hour", "sentiment_textblob"])["text"].count()
pnn_counts_textblob = pnn_counts_textblob.to_frame()
pnn_counts_textblob = pnn_counts_textblob.reset_index()
pnn_counts_textblob = pnn_counts_textblob.rename(columns= {"text":"textblob_count"})

pnn_counts_nltk = example_tweets_graph.groupby(["hour", "sentiment_nltk"])["text"].count()
pnn_counts_nltk = pnn_counts_nltk.to_frame()
pnn_counts_nltk = pnn_counts_nltk.reset_index()
pnn_counts_nltk = pnn_counts_nltk.rename(columns= {"text":"nltk_count"})

pnn_counts_own = example_tweets_graph.groupby(["hour", "sentiment_own_classifiers"])["text"].count()
pnn_counts_own = pnn_counts_own.to_frame()
pnn_counts_own = pnn_counts_own.reset_index()
pnn_counts_own = pnn_counts_own.rename(columns= {"text":"own_classifier_count"})


In [35]:
textblob_count = pnn_counts_textblob["textblob_count"].tolist()
own_classifier_count = pnn_counts_own["own_classifier_count"].tolist()
pnn_counts_nltk["textblob_count"] = textblob_count
pnn_counts_nltk["own_classifier_count"] = own_classifier_count

pnn_counts = pnn_counts_nltk
pnn_counts = pnn_counts.rename(columns= {"sentiment_nltk":"sentiment"})
pnn_counts.head(9)

Unnamed: 0,hour,sentiment,nltk_count,textblob_count,own_classifier_count
0,0,negative,7,4,76
1,0,neutral,65,69,13
2,0,positive,22,21,5


In [37]:
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,2,3, figsize=(45,15))
sns.lineplot(x="hour", y="textblob_count", hue="sentiment", data=pnn_counts, ax=ax[0])
sns.lineplot(x="hour", y="nltk_count", hue="sentiment", data=pnn_counts, ax=ax[1])
sns.lineplot(x="hour", y="own_classifier_count", hue="sentiment", data=pnn_counts, ax=ax[2])
#plt.legend(loc="upper left")

  "sharex argument to subplots() was an integer. "


ValueError: sharex [3] must be one of ['all', 'row', 'col', 'none']

<Figure size 3240x1080 with 0 Axes>