## Import the Pickle

In [None]:
import pickle
from langdetect import detect
import re
import string
#from project_functions import *
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import sentiment_mod_own_labeled as s

In [None]:
# IPython shell escape: list the directory the tweet pickle is read from.
!ls /home/jovyan/work/2_Semester

In [None]:
# Load the pickled tweets. The context manager closes the file handle even
# when unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that come from a trusted source.
with open("/home/jovyan/work/2_Semester/tweets_jan2019", "rb") as pickle_tweets:
    example_tweets = pickle.load(pickle_tweets)

# Keep only the timestamp and the tweet text.
example_tweets = example_tweets[["datetime","text"]]
example_tweets.head()

### Take a sample

In [None]:
# Continue with the first 100 rows only.
example_tweets = example_tweets.head(100)

### Filter on English Tweets

In [None]:
def filter_english(dataframe, detector=None):
    """Return the rows of *dataframe* whose text is detected as English.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a "text" column.
    detector : callable, optional
        Function mapping one text to a language code. Defaults to the
        module-level ``detect`` imported from ``langdetect``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose detected language is "en",
        with an extra "Language" column. The input frame is left untouched,
        so later column assignments on the result do not trigger pandas'
        SettingWithCopyWarning.
    """
    if detector is None:
        detector = detect

    # Placeholder language recorded for texts the detector cannot handle.
    blanco = "blanco"

    language_list = []
    for text in dataframe["text"]:
        try:
            language_list.append(detector(text))
        except Exception:
            # Best effort: a text the detector rejects is simply treated as
            # non-English. KeyboardInterrupt/SystemExit still propagate.
            language_list.append(blanco)

    labelled = dataframe.assign(Language=language_list)
    return labelled.loc[labelled["Language"] == "en"].copy()


In [None]:
# Keep only the tweets detected as English (adds a "Language" column).
example_tweets = filter_english(example_tweets)

### Clean Tweets

In [None]:
def removal_function(dataframe):
    """Clean the "text" column of *dataframe* in place and return the frame.

    Every tweet is processed in this order:

    1. @-mentions are removed;
    2. newlines are replaced by spaces;
    3. links starting with "http" are removed;
    4. runs of whitespace are collapsed to a single space;
    5. the HTML entity "&amp;" is decoded to "&";
    6. digits are removed;
    7. a character repeated three or more times is shortened to two, so
       elongated words ("soooo") are normalised while ordinary double
       letters ("good", "will") stay intact;
    8. whitespace is collapsed once more, because steps 6 and 7 can leave
       double spaces behind.

    Leading and trailing spaces are not stripped.
    """
    # Compile once instead of re-parsing each pattern for every tweet.
    mention_re = re.compile(r"@[A-Z-a-z-0-9_.]+")
    link_re = re.compile(r"http\S+")
    whitespace_re = re.compile(r"\s+")
    ampersand_re = re.compile(r"&(amp;)")
    digit_re = re.compile(r"[0-9]")
    # Only runs of 3+ identical characters are treated as elongation; a
    # broader "any repeated sequence" rule would turn "good" into "god".
    elongation_re = re.compile(r"(.)\1{2,}")

    new_text = []
    for tweet in dataframe["text"]:
        cleaned = mention_re.sub("", tweet)
        cleaned = cleaned.replace("\n", " ")
        cleaned = link_re.sub("", cleaned)
        cleaned = whitespace_re.sub(" ", cleaned)
        cleaned = ampersand_re.sub("&", cleaned)
        cleaned = digit_re.sub("", cleaned)
        cleaned = elongation_re.sub(r"\1\1", cleaned)
        cleaned = whitespace_re.sub(" ", cleaned)
        new_text.append(cleaned)

    dataframe["text"] = new_text
    return dataframe

In [None]:
# Strip mentions, links, digits and redundant whitespace from the tweet text.
example_tweets = removal_function(example_tweets)

### Tokenize, Remove Stopwords, Lemmatize, Stem

In [None]:
def tokenize_dataframe(dataframe):
    """Turn every entry of the "text" column into a list of word tokens.

    Each text is lower-cased and then split into runs of word characters,
    so punctuation and whitespace never end up in a token. The column is
    overwritten in place and the same frame is returned.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    dataframe["text"] = [
        word_tokenizer.tokenize(entry.lower()) for entry in dataframe["text"]
    ]
    return dataframe

In [None]:
# Replace each tweet string with a list of lower-cased word tokens.
example_tweets = tokenize_dataframe(example_tweets)

In [None]:
def remove_stopwords_dataframe(dataframe):
    """Remove English stopwords from the token lists in the "text" column.

    Each entry of the column is expected to be an iterable of tokens. The
    column is overwritten in place with the filtered lists and the same
    frame is returned.
    """
    # Build the lookup set once; it does not change between rows, and
    # rebuilding it per row re-reads the stopword corpus every time.
    stop_words = set(stopwords.words("english"))

    dataframe["text"] = [
        [token for token in tokens if token not in stop_words]
        for tokens in dataframe["text"]
    ]
    return dataframe

In [None]:
# Drop English stopwords from every token list.
example_tweets = remove_stopwords_dataframe(example_tweets)

In [None]:
def lemmatize_dataframe(dataframe):
    """Lemmatize every token of the token lists in the "text" column.

    A single WordNetLemmatizer is applied to each token with its default
    settings. The column is overwritten in place and the same frame is
    returned.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_rows = []
    for tokens in dataframe["text"]:
        lemmatized_rows.append([lemmatizer.lemmatize(word) for word in tokens])

    dataframe["text"] = lemmatized_rows
    return dataframe
        

In [None]:
def stemmatize_dataframe(dataframe):
    """Stem every token of the token lists in the "text" column.

    Uses one English Snowball stemmer for all rows. The column is
    overwritten in place and the same frame is returned.
    """
    english_stemmer = nltk.SnowballStemmer("english")
    stemmed_rows = []
    for tokens in dataframe["text"]:
        stemmed_rows.append([english_stemmer.stem(word) for word in tokens])

    dataframe["text"] = stemmed_rows
    return dataframe

In [None]:
def untokenize_dataframe(dataframe):
    """Join the token lists in the "text" column back into strings.

    Tokens are separated by a single space; an empty list becomes an empty
    string. The column is overwritten in place and the same frame is
    returned.
    """
    dataframe["text"] = [" ".join(tokens) for tokens in dataframe["text"]]
    return dataframe

In [None]:
# Join the token lists back into space-separated strings.
example_tweets = untokenize_dataframe(example_tweets)

### Sentiment Using TextBlob

In [None]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
def add_sentiment_textblob(dataframe):
    """Add a "sentiment_textblob" column derived from the "text" column.

    ``s.sentiment_textblob`` is called once per text and must return a pair;
    the first element is stored as the label, the second (the polarity) is
    discarded. The frame is modified in place and returned.
    """
    labels = [
        label for label, _polarity in map(s.sentiment_textblob, dataframe["text"])
    ]
    dataframe["sentiment_textblob"] = labels
    return dataframe

In [None]:
# Add the "sentiment_textblob" column.
example_tweets = add_sentiment_textblob(example_tweets)

### Sentiment Using NLTK

In [None]:
def add_sentiment_nltk(dataframe):
    """Add a "sentiment_nltk" column derived from the "text" column.

    ``s.sentiment_nltk`` is called once per text and must return a pair;
    the first element is stored as the label, the second (the polarity) is
    discarded. The frame is modified in place and returned.
    """
    labels = [
        label for label, _polarity in map(s.sentiment_nltk, dataframe["text"])
    ]
    dataframe["sentiment_nltk"] = labels
    return dataframe

In [None]:
# Add the "sentiment_nltk" column.
example_tweets = add_sentiment_nltk(example_tweets)

### Sentiment Using Own Trained Models

In [None]:
# Load the previously saved frame. The context manager closes the file
# handle even when unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that come from a trusted source.
with open("df_tweets/df_nltk.pickle","rb") as pickle_df_sentiment:
    df_sentiment = pickle.load(pickle_df_sentiment)

In [None]:
# Preview the first rows of the loaded frame.
df_sentiment.head()

In [None]:
def add_sentiment_own(dataframe, threshold=0.7, verbose=False):
    """Add a "sentiment_own_classifiers" column derived from the "text" column.

    ``s.sentiment`` is called once per text and must return a
    ``(label, confidence)`` pair. The label is kept only when the confidence
    is at least *threshold* in absolute value; otherwise the row is labelled
    "neutral".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a "text" column; modified in place.
    threshold : float, optional
        Confidence strictly inside ``(-threshold, threshold)`` yields
        "neutral". Defaults to 0.7.
    verbose : bool, optional
        When true, print every text as it is classified (debugging aid).
        Defaults to False so large frames do not flood the output.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    sentiment_own_list = []

    for text in dataframe["text"]:
        sentiment_value, confidence = s.sentiment(text)
        if verbose:
            print(text)
        if -threshold < confidence < threshold:
            sentiment_own_list.append("neutral")
        else:
            sentiment_own_list.append(sentiment_value)

    dataframe["sentiment_own_classifiers"] = sentiment_own_list
    return dataframe

In [None]:
# Label every row of df_sentiment; adds the "sentiment_own_classifiers" column.
df_sentiment = add_sentiment_own(df_sentiment)

In [None]:
# Show the (rows, columns) dimensions of the labelled frame.
df_sentiment.shape

### Graph sentiment & BTC time series

In [None]:
import copy
# Work on an independent copy so the graphing steps do not alter example_tweets.
example_tweets_graph = copy.deepcopy(example_tweets)