# Data normalizer

## Loading all data sources

In [1]:
import pandas as pd
import numpy as np
import json

# Binance processing

In [9]:
def get_binance_df(path="../data/btcdata-1m.csv"):
    """Load the Binance CSV export into a time-indexed DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the location that used to be
        hard-coded here, so existing no-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents indexed by the parsed ``time`` column, with the
        ``time`` and ``nothing`` columns removed.

    Raises
    ------
    KeyError
        If the file has no ``time`` or ``nothing`` column.
    """
    df_binance = pd.read_csv(path)
    df_binance.index = pd.to_datetime(df_binance["time"])

    # ``time`` now lives in the index; return a new frame rather than
    # mutating in place so the function has no hidden side effects.
    return df_binance.drop(columns=["time", "nothing"])

# Google Trends processing

In [8]:
def _load_gtrends_csv(path, extra_columns=()):
    """Read one Google Trends CSV and index it by its parsed ``date`` column.

    ``date`` and ``isPartial`` are always dropped; ``extra_columns`` names any
    further columns to remove from this particular file.
    """
    frame = pd.read_csv(path)
    frame.index = pd.to_datetime(frame["date"])
    return frame.drop(columns=["date", "isPartial"] + list(extra_columns))


def get_google_df(btc_path="../data/btc-gtrends.csv",
                  eth_path="../data/eth-gtrends.csv",
                  gen_path="../data/gen-gtrends.csv"):
    """Load the three Google Trends exports and join them on their date index.

    Parameters
    ----------
    btc_path, eth_path, gen_path : str or path-like, optional
        Locations of the three CSV files. The defaults are the paths that
        used to be hard-coded here, so no-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Date-indexed frame holding the remaining columns of all three files.
        ``pd.merge`` defaults to an inner join, so only dates present in
        every file are kept.

    Raises
    ------
    KeyError
        If a file lacks ``date``, ``isPartial`` or one of the columns that is
        explicitly dropped for it.
    """
    df_btc = _load_gtrends_csv(btc_path, ("BTCUSD", "BTCUSDT"))
    df_eth = _load_gtrends_csv(eth_path, ("ETHUSD",))
    df_gen = _load_gtrends_csv(gen_path)

    df_google = pd.merge(df_btc, df_eth, left_index=True, right_index=True)
    return pd.merge(df_google, df_gen, left_index=True, right_index=True)

# Twitter processing

In [7]:
def get_tweets_df(path="../data/tweetsbtc.json"):
    """Load tweets from a JSON-lines file and average sentiment per minute.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the file holding one JSON object per line. Defaults to
        the location that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per minute between the first and last tweet, with the mean
        ``polarity`` and ``subjectivity`` of the tweets in that minute.
        Minutes without tweets hold NaN.

    Raises
    ------
    KeyError
        If the records have no ``created_at``, ``polarity`` or
        ``subjectivity`` field.
    """
    # The context manager closes the handle even if a line fails to parse;
    # blank lines (e.g. a trailing newline) are skipped instead of crashing
    # json.loads.
    with open(path, "r") as handle:
        records = [json.loads(line) for line in handle if line.strip()]

    tweets = pd.DataFrame(records)
    tweets.index = pd.to_datetime(tweets["created_at"])

    # pd.TimeGrouper has been removed from pandas; pd.Grouper(freq=...) is
    # its replacement. "min" is the minute alias ("T" is deprecated).
    per_minute = pd.Grouper(freq="min")
    return tweets[["polarity", "subjectivity"]].groupby(per_minute).mean()

In [14]:
# NOTE(review): `tweets` is only assigned as a local variable inside
# get_tweets_df(); no module-level `tweets` is defined in the cells shown
# here, so this cell presumably raises NameError on a fresh kernel and the
# recorded output comes from earlier kernel state — confirm or remove.
tweets.size

6492906

# Merging data

In [13]:
def normalize_data_df(output_path="../data/fulldata.csv"):
    """Merge the Binance, Google Trends and tweet frames and write them to CSV.

    The three frames are joined on their indexes with ``pd.merge``'s default
    inner join, so only index values present in all three sources are kept.

    Parameters
    ----------
    output_path : str or path-like, optional
        Destination of the merged CSV. Defaults to the location that used to
        be hard-coded here, so no-argument calls behave as before.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    df_binance = get_binance_df()
    df_google = get_google_df()
    df_tweets = get_tweets_df()

    df_merged = df_binance.merge(df_google, left_index=True, right_index=True)
    df_merged = df_merged.merge(df_tweets, left_index=True, right_index=True)

    # NOTE(review): de-duplicating on the ``close`` value alone also discards
    # rows from different timestamps that happen to share the same value.
    # Behaviour kept as-is; confirm whether de-duplicating on the index was
    # the actual intent.
    df_merged.drop_duplicates(subset="close").to_csv(output_path)

In [None]:
# Run the full pipeline: load the three sources, merge them on their indexes
# and write the result to ../data/fulldata.csv.
normalize_data_df()