In [73]:
import tweepy
import math
import nltk
import numpy as np
import yfinance as yf
import pandas as pd
from pandas import Series, DataFrame
from pandas.plotting import scatter_matrix
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from datetime import datetime 
from datetime import timedelta
from textblob import TextBlob

In [74]:
def get_data(input_data, start=None, end=None):
    """Download price history for a ticker through ``yf.download``.

    Args:
        input_data: value forwarded as the first argument of ``yf.download``
            (the notebook passes a ticker symbol such as ``'GOOGL'``).
        start: first date to request. Defaults to 2008-05-05, the date this
            notebook originally hard-coded.
        end: last date to request. Defaults to the moment of the call.

    Returns:
        pandas.DataFrame built from whatever ``yf.download`` returned.
    """
    if start is None:
        start = datetime(2008, 5, 5)
    if end is None:
        # Resolved on every call so a long-lived kernel never reuses a stale "now".
        end = datetime.now()
    data = yf.download(input_data, start=start, end=end)
    return pd.DataFrame(data)

In [75]:
def moving_avg(df, windows=(10, 30, 50)):
    """Derive intraday-move, moving-average and return features from prices.

    The input frame is left untouched; all new columns live on a copy.

    Args:
        df: frame containing 'Open', 'High', 'Low', 'Close' and 'Volume'.
        windows: rolling-window lengths (in rows) for the moving averages of
            'Close'. Each length ``w`` produces a column named ``'MA<w>'``.

    Returns:
        New DataFrame with columns 'Close', 'HighLoad', 'Change', 'Volume',
        one 'MA<w>' per window (in the order given) and 'rets':
          * HighLoad - (High - Close) / Close * 100
          * Change   - (Close - Open) / Open * 100
          * rets     - Close / previous Close - 1
    """
    # Explicit copy: adding columns to a column-subset selection is the classic
    # SettingWithCopyWarning trigger and is ambiguous across pandas versions.
    prices = df[['Open', 'High', 'Low', 'Close', 'Volume']].copy()
    prices['HighLoad'] = (prices['High'] - prices['Close']) / prices['Close'] * 100.0
    prices['Change'] = (prices['Close'] - prices['Open']) / prices['Open'] * 100.0

    features = prices[['Close', 'HighLoad', 'Change', 'Volume']].copy()

    for window in windows:
        features['MA{}'.format(window)] = features['Close'].rolling(window).mean()

    features['rets'] = features['Close'] / features['Close'].shift(1) - 1

    return features


In [76]:
def make_predictions(df, return_forecast=False):
    """Build the scaled feature matrix and shifted-price labels for regression.

    Side effects: ``df`` is modified in place. 'HL_PCT', 'PCT_change' and
    'label' columns are added (or overwritten) and any NaN already present is
    replaced by the sentinel -99999.

    Args:
        df: price frame with 'Open', 'High', 'Low', 'Close' and 'Adj Close'.
        return_forecast: when True, also return the scaled feature rows that
            have no label yet (the most recent ``forecast_out`` rows).

    Returns:
        ``(X, y)`` by default, or ``(X, y, X_forecast)`` when
        ``return_forecast`` is True.
          * X - scaled features for every row except the last ``forecast_out``.
          * y - 'Adj Close' shifted ``forecast_out`` rows into the future,
            aligned with X.
          * X_forecast - scaled features of the last ``forecast_out`` rows.
        ``forecast_out`` is 1% of the row count, rounded up.

    Raises:
        ValueError: if ``df`` has no rows (``forecast_out`` would be 0 and the
            ``[-0:]`` / ``[:-0]`` slices below would silently swap meaning).
    """
    if len(df) == 0:
        raise ValueError('make_predictions needs at least one row of price data')

    df['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Missing values are replaced with a sentinel (not dropped) so no row is lost.
    df.fillna(value=-99999, inplace=True)
    # Hold back 1 percent of the rows as the forecast horizon.
    forecast_out = int(math.ceil(0.01 * len(df)))
    # The label is the adjusted close `forecast_out` rows ahead.
    forecast_col = 'Adj Close'
    df['label'] = df[forecast_col].shift(-forecast_out)
    # Keyword form: the positional `axis` argument of DataFrame.drop was removed
    # in pandas 2.0, where `df.drop(['label'], 1)` raises TypeError.
    X = np.array(df.drop(columns=['label']))
    # Scale the features so they share one distribution for the regressors.
    X = preprocessing.scale(X)
    # Latest rows have no label yet (forecast inputs); earlier rows are for training.
    X_forecast = X[-forecast_out:]
    X = X[:-forecast_out]
    # Labels aligned with the training rows.
    y = np.array(df['label'])[:-forecast_out]

    if return_forecast:
        return X, y, X_forecast
    return X, y
 
def training(X, y, X_forecast=None):
    """Fit three regressors on an 80/20 split and predict the forecast rows.

    Args:
        X: feature matrix.
        y: labels aligned with ``X``.
        X_forecast: feature rows to predict for. The original code read this
            name without receiving or defining it; when the argument is
            omitted, a module-level ``X_forecast`` is used if one exists.

    Returns:
        ``(forecast_reg, forecast_knn, forecast_by)``: predictions for
        ``X_forecast`` from LinearRegression, KNeighborsRegressor
        (2 neighbours) and BayesianRidge respectively.

    Raises:
        ValueError: if no forecast rows are supplied and no module-level
            ``X_forecast`` is defined.
    """
    if X_forecast is None:
        # Legacy path: earlier revisions relied on a notebook-global X_forecast.
        X_forecast = globals().get('X_forecast')
        if X_forecast is None:
            raise ValueError(
                'training() needs the rows to forecast: pass X_forecast=...'
            )

    # Only the training part of the split is needed here; the held-out part is unused.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )

    # Linear, KNN and Bayesian ridge regression, in the order they are returned.
    models = (
        LinearRegression(n_jobs=-1),
        KNeighborsRegressor(n_neighbors=2),
        BayesianRidge(),
    )
    for model in models:
        model.fit(X_train, y_train)

    forecast_reg, forecast_knn, forecast_by = (
        model.predict(X_forecast) for model in models
    )
    return forecast_reg, forecast_knn, forecast_by
    
    
def confident(X,y):
    """Fit three regressors, score them on a held-out split, and lay forecasts out by date.

    NOTE(review): a second function named ``confident`` (taking ``score``) is
    defined in a later cell of this notebook; once that cell runs, it replaces
    this definition.

    NOTE(review): ``df``, ``forecast_reg``, ``forecast_knn`` and ``forecast_by``
    are read here but are neither parameters nor assigned in this function, so
    they must already exist at module level when it is called. ``df`` is
    modified: 'Forecast_reg', 'Forecast_knn' and 'forecast_by' columns are
    added, and a row is assigned for each value in ``forecast_reg``.

    Args:
        X: features passed to ``train_test_split``.
        y: labels passed to ``train_test_split``.

    Returns:
        A 5-tuple: the index of ``df`` formatted as '%Y-%m-%d' strings,
        followed by the 'Adj Close', 'Forecast_reg', 'Forecast_knn' and
        'forecast_by' columns of ``df`` as lists. The three percentage scores
        computed below are not part of the return value.
    """
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    
    # Linear regression
    model = LinearRegression(n_jobs=-1)
    model.fit(X_train, y_train)


    # KNN Regression
    model_knn = KNeighborsRegressor(n_neighbors=2)
    model_knn.fit(X_train, y_train)


    # Bayesian Ridge Regression
    model_by = BayesianRidge()
    model_by.fit(X_train, y_train)
    
    # Score each model on the held-out split
    # (presumably R^2, as for scikit-learn regressors -- confirm).
    confidencereg = model.score(X_test, y_test)
    confidence_model_knn = model_knn.score(X_test,y_test)
    confidence_model_by = model_by.score(X_test,y_test)
    

    # (disabled) debug prints of the three scores
    # print('The linear regression confidence is:',confidencereg*100)
    # print('The KNN regression confidence is:',confidence_model_knn*100)
    # print('The Bayesian ridge regression confidence is:',confidence_model_by*100)
    # Scores as percentages. NOTE(review): reg/knn/by are never used afterwards.
    reg = confidencereg * 100
    knn = confidence_model_knn * 100
    by = confidence_model_by * 100
    

    # Lay the linear-regression forecasts out on the days after the last index label.
    df['Forecast_reg'] = np.nan

    last_date = df.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + timedelta(days=1)

    for i in forecast_reg:
        next_date = next_unix
        next_unix += timedelta(days=1)
        # Assign an all-NaN row at next_date, then set only its forecast cell.
        df.loc[next_date] = [np.nan for _ in range(len(df.columns))]
        # NOTE(review): chained `df[col].loc[...] = ...`; whether this writes
        # through to df depends on the pandas version / copy-on-write -- confirm.
        df['Forecast_reg'].loc[next_date] = i
        
    df['Forecast_knn'] = np.nan

    # NOTE(review): hard-coded offset -26; presumably meant to step back to the
    # row just before the forecast rows assigned above -- confirm it matches
    # len(forecast_reg).
    last_date = df.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + timedelta(days=1)
        
    for i in forecast_knn:
        next_date = next_unix
        next_unix += timedelta(days=1)
        # No row assignment here, unlike the first loop; same chained-write caveat.
        df['Forecast_knn'].loc[next_date] = i

    df['forecast_by'] = np.nan

    # Same hard-coded -26 offset as for the KNN column (see note above).
    last_date = df.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + timedelta(days=1)
        
    for i in forecast_by:
        next_date = next_unix
        next_unix += timedelta(days=1)
        df['forecast_by'].loc[next_date] = i
        
    

    # Dates as 'YYYY-MM-DD' strings plus the actual and forecast series as plain lists.
    return df.index.format(formatter=lambda x: x.strftime('%Y-%m-%d')), df['Adj Close'].to_list(), df['Forecast_reg'].to_list(), df['Forecast_knn'].to_list(), df['forecast_by'].to_list()

In [77]:
def confident(score):
    """Hand back ``score`` exactly as received (identity pass-through).

    Note: this shares its name with the ``confident(X, y)`` defined in an
    earlier cell, so running this cell replaces that definition.
    """
    passthrough = score
    return passthrough

In [78]:
def retrieving_tweets_polarity(symbol, max_tweets=100):
    """Average TextBlob polarity per tweet for recent English tweets about ``symbol``.

    Credentials are read from the environment instead of being embedded in the
    notebook. The keys that were previously committed here should be treated
    as compromised and revoked.

    Required environment variables:
        TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
        TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET

    Args:
        symbol: search query; converted with ``str()`` before searching.
        max_tweets: upper bound on the number of tweets fetched.

    Returns:
        Sum of the polarity of every sentence in every fetched tweet, divided
        by the number of tweets. ``0.0`` when the search yields no tweets.

    Raises:
        RuntimeError: if any required environment variable is unset or empty.
    """
    import os  # local import: keeps this cell self-contained

    env_names = (
        'TWITTER_CONSUMER_KEY',
        'TWITTER_CONSUMER_SECRET',
        'TWITTER_ACCESS_TOKEN',
        'TWITTER_ACCESS_TOKEN_SECRET',
    )
    missing = [name for name in env_names if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            'Missing Twitter credentials; set environment variable(s): '
            + ', '.join(missing)
        )
    consumer_key, consumer_secret, access_token, access_token_secret = (
        os.environ[name] for name in env_names
    )

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    user = tweepy.API(auth)

    # tweepy 4 renamed API.search to API.search_tweets; accept either.
    search = getattr(user, 'search_tweets', None) or user.search
    tweets = tweepy.Cursor(
        search, q=str(symbol), tweet_mode='extended', lang='en'
    ).items(max_tweets)

    tweet_count = 0
    global_polarity = 0
    for tweet in tweets:
        blob = TextBlob(tweet.full_text)
        for sentence in blob.sentences:
            global_polarity += sentence.sentiment.polarity
        tweet_count += 1

    if tweet_count == 0:
        # No tweets found: report neutral rather than dividing by zero.
        return 0.0
    return global_polarity / tweet_count

In [79]:
def sentiment(input_value):
    """Turn the Twitter polarity for ``input_value`` into a one-line verdict.

    Args:
        input_value: symbol handed to ``retrieving_tweets_polarity`` and
            echoed (via ``str()``) inside the returned sentence.

    Returns:
        The "GREAT idea" sentence when the polarity is strictly positive;
        otherwise the "BAD idea" sentence. A polarity of exactly zero gets the
        same "BAD idea" wording as a negative one.
    """
    positive_template = 'According to the predictions and twitter sentiment analysis -> Investing in "{}" is a GREAT idea!'
    negative_template = 'According to the predictions and twitter sentiment analysis -> Investing in "{}" is a BAD idea!'

    score = retrieving_tweets_polarity(input_value)
    chosen = positive_template if score > 0 else negative_template
    return chosen.format(str(input_value))

In [80]:
# Fetch the GOOGL price history through get_data (calls yf.download) and display it.
df = get_data('GOOGL')
df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-05-05,299.729736,299.799805,293.858856,297.747742,297.747742,12549400
2008-05-06,295.795807,296.296295,291.791779,293.473480,293.473480,9249300
2008-05-07,295.430420,300.045044,288.503510,289.789795,289.789795,13212700
2008-05-08,293.393402,294.944946,289.744751,291.796783,291.796783,10235500
2008-05-09,289.789795,292.792786,285.935944,286.886902,286.886902,8960800
...,...,...,...,...,...,...
2020-04-13,1201.500000,1214.520020,1182.329956,1210.410034,1210.410034,1935100
2020-04-14,1239.969971,1275.750000,1228.540039,1265.229980,1265.229980,3167900
2020-04-15,1246.510010,1275.109985,1234.000000,1257.300049,1257.300049,2111800
2020-04-16,1267.140015,1273.359985,1238.199951,1257.430054,1257.430054,2883100


In [81]:
# Derived features from the price frame: HighLoad, Change, moving averages and
# daily returns (see moving_avg).
mov = moving_avg(df)
mov

Unnamed: 0_level_0,Close,HighLoad,Change,Volume,MA10,MA30,MA50,rets
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008-05-05,297.747742,0.689195,-0.661261,12549400,,,,
2008-05-06,293.473480,0.961864,-0.785111,9249300,,,,-0.014355
2008-05-07,289.789795,3.538858,-1.909291,13212700,,,,-0.012552
2008-05-08,291.796783,1.078889,-0.544190,10235500,,,,0.006926
2008-05-09,286.886902,2.058610,-1.001724,8960800,,,,-0.016826
...,...,...,...,...,...,...,...,...
2020-04-13,1210.410034,0.339553,0.741576,1935100,1160.981995,1179.157003,1292.034199,0.003183
2020-04-14,1265.229980,0.831471,2.037147,3167900,1172.873987,1175.120671,1288.683198,0.045290
2020-04-15,1257.300049,1.416522,0.865620,2111800,1182.408997,1172.440007,1284.177200,-0.006268
2020-04-16,1257.430054,1.266864,-0.766290,2883100,1197.942004,1168.301009,1280.417600,0.000103


In [82]:
# make_predictions adds HL_PCT, PCT_change and label columns to df in place;
# the (X, y) it returns is not captured here, only the mutated df is displayed.
make_predictions(df)
df


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,HL_PCT,PCT_change,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2008-05-05,299.729736,299.799805,293.858856,297.747742,297.747742,12549400,1.995296,-0.661261,281.471466
2008-05-06,295.795807,296.296295,291.791779,293.473480,293.473480,9249300,1.534897,-0.785111,280.380371
2008-05-07,295.430420,300.045044,288.503510,289.789795,289.789795,13212700,3.982726,-1.909291,273.488495
2008-05-08,293.393402,294.944946,289.744751,291.796783,291.796783,10235500,1.782129,-0.544190,272.877869
2008-05-09,289.789795,292.792786,285.935944,286.886902,286.886902,8960800,2.390085,-1.001724,271.421417
...,...,...,...,...,...,...,...,...,...
2020-04-13,1201.500000,1214.520020,1182.329956,1210.410034,1210.410034,1935100,2.659435,0.741576,
2020-04-14,1239.969971,1275.750000,1228.540039,1265.229980,1265.229980,3167900,3.731334,2.037147,
2020-04-15,1246.510010,1275.109985,1234.000000,1257.300049,1257.300049,2111800,3.269704,0.865620,
2020-04-16,1267.140015,1273.359985,1238.199951,1257.430054,1257.430054,2883100,2.796182,-0.766290,


In [83]:
# NOTE(review): `s` is not assigned in any cell shown in this notebook, so this
# cell raises NameError (see its output). Define the score first or remove the cell.
print(confident(s))

NameError: name 's' is not defined

In [34]:
# NOTE(review): this cell's execution count (In [34]) is out of sequence with the
# cells above (In [73]..[83]); the saved output may predate the current
# definitions -- re-check after Restart & Run All.
sentiment('GOOGL')

'According to the predictions and twitter sentiment analysis -> Investing in "GOOGL" is a GREAT idea!'