In [1]:
import pyspark as spark
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql.functions import col,udf,monotonically_increasing_id,unix_timestamp,round,avg
import re
# Reuse the already-running SparkContext when this cell is executed again in the
# same kernel; a second plain SparkContext() call would raise instead.
sc = spark.SparkContext.getOrCreate()
# SQL entry point bound to that context; used below to build Spark DataFrames.
sql = spark.SQLContext(sc)

In [2]:
# Both CSVs are parsed with pandas first (python engine, malformed lines skipped,
# no header row so columns are numbered), then converted to Spark DataFrames.
csv_read_options = dict(error_bad_lines=False, engine='python', header=None)
df1 = pd.read_csv('/home/manoj/ProjectBigData/Data/tweetsfinal.csv', **csv_read_options)
df2 = pd.read_csv('/home/manoj/ProjectBigData/Data/BitCoinPrice.csv', **csv_read_options)
FullDataTw = sql.createDataFrame(df1)    # tweets
FullDataBtc = sql.createDataFrame(df2)   # hourly Bitcoin prices

Skipping line 845142: unexpected end of data


In [3]:
# Drop every tweet row that has a null in any column (dropna's default is how='any'),
# then report the row count of each dataset.
FullDataTw = FullDataTw.dropna(how='any')
for frame in (FullDataTw, FullDataBtc):
    print(frame.count())

845141
217


In [4]:
# The CSVs were loaded without a header, so the columns arrive named '0' and '1'.
# (The former `FullDataTw.select(monotonically_increasing_id()...)` line was removed:
# its result was never assigned, so it had no effect.)
FullDataTw = (
    FullDataTw
    .withColumnRenamed('0', 'DateTime')   # tweet creation time, still a string
    .withColumnRenamed('1', 'Tweet')      # raw tweet text
)
FullDataBtc = (
    FullDataBtc
    .withColumnRenamed('0', 'DateTime')   # price time, still a string
    .withColumnRenamed('1', 'Price')
)
# The price file's own header line was read as a data row; drop it.
FullDataBtc = FullDataBtc.filter(FullDataBtc.DateTime != 'Date')

In [5]:
# Alias for the frame the cleaning steps work on. No sampling happens here:
# Tw_samp refers to the full tweet dataset.
Tw_samp = FullDataTw

In [6]:
import preprocessor as p #cleaning each tweet using tweet-preprocessor like removing hashtags,urls,emojis....
def function_udf(input_str):
    """Clean one raw tweet down to plain alphanumeric text.

    Steps, in order:
      1. Remove the standalone retweet marker ``RT``.
      2. Run tweet-preprocessor's ``clean`` with the URL, EMOJI and MENTION
         options set.
      3. Replace any remaining @handles, URL-like tokens and every character
         that is not an ASCII letter, digit, space or tab with a space, then
         collapse runs of whitespace to single spaces.

    Args:
        input_str: Raw tweet text, or None.

    Returns:
        The cleaned text, or None when ``input_str`` is None.
    """
    if input_str is None:
        return None
    # Word boundaries keep words that merely contain "RT" (e.g. "SMART") intact.
    input_str = re.sub(r'\bRT\b', '', input_str)
    # NOTE(review): options are set on every call rather than once at module level,
    # presumably so they are applied in whichever process Spark runs the UDF in;
    # confirm before hoisting.
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
    input_str = p.clean(input_str)
    # Raw string: same pattern as before, without invalid-escape warnings.
    stripped = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", input_str)
    return ' '.join(stripped.split())
# Expose the cleaner as a string-returning Spark UDF and store its result
# next to the raw text in a new 'CleanedTweets' column.
func_udf = udf(function_udf, StringType())
CleanDF = Tw_samp.withColumn('CleanedTweets', func_udf(Tw_samp.Tweet))
CleanDF.show(5)

+--------------------+--------------------+--------------------+
|            DateTime|               Tweet|       CleanedTweets|
+--------------------+--------------------+--------------------+
|Thu Nov 09 17:43:...|RT @Forbes: The F...|The Failure of Se...|
|Thu Nov 09 17:43:...|RT @mindstatex: L...|Lots of love from...|
|Thu Nov 09 17:43:...|RT @LevelNetwork:...|Join our telegram...|
|Thu Nov 09 17:43:...|RT @realsheepwolf...|DIGAF FLOAT 16M T...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [7]:
from textblob import TextBlob  #passing cleaned tweets and getting a sentiment score for each tweet
def senti_score_udf(input_str):
    """Return the TextBlob sentiment polarity of ``input_str``."""
    return TextBlob(input_str).sentiment.polarity
# Score every cleaned tweet; the polarity is stored as a float column.
func_udf2 = udf(senti_score_udf, FloatType())
CleanDF = CleanDF.withColumn('Sentiment_score', func_udf2(CleanDF.CleanedTweets))
CleanDF.show(5)

+--------------------+--------------------+--------------------+---------------+
|            DateTime|               Tweet|       CleanedTweets|Sentiment_score|
+--------------------+--------------------+--------------------+---------------+
|Thu Nov 09 17:43:...|RT @Forbes: The F...|The Failure of Se...|    -0.18888889|
|Thu Nov 09 17:43:...|RT @mindstatex: L...|Lots of love from...|     0.25833333|
|Thu Nov 09 17:43:...|RT @LevelNetwork:...|Join our telegram...|            0.0|
|Thu Nov 09 17:43:...|RT @realsheepwolf...|DIGAF FLOAT 16M T...|          -0.05|
+--------------------+--------------------+--------------------+---------------+
only showing top 5 rows



In [8]:
def Tw_Time_format(stri):
    """Rewrite a tweet time string as ``YYYY-MM-DD HH:MM:SS``.

    The input is read by fixed position: month abbreviation at [4:7],
    ``DD HH:MM:SS`` at [8:19] and the four-digit year in the last four
    characters, e.g. ``'Thu Nov 09 17:43:41 +0000 2017'``.

    Args:
        stri: Tweet time string in the layout above, or None.

    Returns:
        The reformatted string (still a string; casting happens later),
        or None when ``stri`` is None.

    Raises:
        KeyError: if characters [4:7] are not an English month abbreviation.
    """
    if stri is None:
        return None
    # All twelve months, so data outside Oct/Nov no longer raises KeyError.
    month_numbers = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
    }
    year = stri[-4:]
    month = month_numbers[stri[4:7]]
    day_and_time = stri[8:19]
    return year + '-' + month + '-' + day_and_time
# Reformat the tweet time as a string first ('DateTime_c'), then cast that
# string to a real timestamp column ('DateTime_casted').
func_udf3 = udf(Tw_Time_format, StringType())
CleanDF = CleanDF.withColumn('DateTime_c', func_udf3(CleanDF.DateTime))
CleanDF = CleanDF.withColumn("DateTime_casted", CleanDF.DateTime_c.cast('timestamp'))
CleanDF.show(5)

+--------------------+--------------------+--------------------+---------------+-------------------+-------------------+
|            DateTime|               Tweet|       CleanedTweets|Sentiment_score|         DateTime_c|    DateTime_casted|
+--------------------+--------------------+--------------------+---------------+-------------------+-------------------+
|Thu Nov 09 17:43:...|RT @Forbes: The F...|The Failure of Se...|    -0.18888889|2017-11-09 17:43:41|2017-11-09 17:43:41|
|Thu Nov 09 17:43:...|RT @mindstatex: L...|Lots of love from...|     0.25833333|2017-11-09 17:43:40|2017-11-09 17:43:40|
|Thu Nov 09 17:43:...|RT @LevelNetwork:...|Join our telegram...|            0.0|2017-11-09 17:43:39|2017-11-09 17:43:39|
|Thu Nov 09 17:43:...|RT @realsheepwolf...|DIGAF FLOAT 16M T...|          -0.05|2017-11-09 17:43:39|2017-11-09 17:43:39|
+--------------------+--------------------+--------------------+---------------+-------------------+-------------------+
only showing top 5 rows



In [9]:
# Keep only the columns needed downstream, renaming two of them on the way.
final_tw_exprs = [
    "DateTime_casted as Date_Time",
    "CleanedTweets as Cleaned_Tweets",
    "Sentiment_score",
]
FinalTw = CleanDF.selectExpr(*final_tw_exprs)
FinalTw.show(5)

+-------------------+--------------------+---------------+
|          Date_Time|      Cleaned_Tweets|Sentiment_score|
+-------------------+--------------------+---------------+
|2017-11-09 17:43:41|The Failure of Se...|    -0.18888889|
|2017-11-09 17:43:40|Lots of love from...|     0.25833333|
|2017-11-09 17:43:39|Join our telegram...|            0.0|
|2017-11-09 17:43:39|DIGAF FLOAT 16M T...|          -0.05|
+-------------------+--------------------+---------------+
only showing top 5 rows



In [10]:
# Check the column types of the trimmed tweet frame before the time arithmetic below.
FinalTw.printSchema()

root
 |-- Date_Time: timestamp (nullable = true)
 |-- Cleaned_Tweets: string (nullable = true)
 |-- Sentiment_score: float (nullable = true)



In [11]:
from datetime import datetime 
from dateutil import parser
def Btc_Time_format(input_str):
    """Rewrite a price time string as ``2017-M-D H:MM:00``.

    The trailing two-digit year ``/17`` is dropped, ``2017-`` is prefixed,
    remaining slashes become dashes and ``:00`` seconds are appended. The
    result is still a string; casting to a timestamp happens in a later cell.
    NOTE(review): this presumably expects ``M/D/17 H:MM`` input such as
    ``'10/31/17 0:00'`` — confirm against the price CSV.

    Args:
        input_str: Price time string, or None.

    Returns:
        The reformatted string, or None when ``input_str`` is None.
    """
    if input_str is None:
        return None
    # Strip only the year component (the '/17' that ends the date part). A plain
    # '/17' pattern would also delete the day on the 17th of a month.
    without_year = re.sub(r'/17(?=\s|$)', '', input_str)
    dashed = '2017-' + re.sub(r'/', '-', without_year)
    return dashed + ':00'
# Note: this rebinds func_udf, which earlier held the tweet-cleaning UDF.
func_udf = udf(Btc_Time_format, StringType())
FullDataBtc = FullDataBtc.withColumn('Cleaned_BTC_Time', func_udf(FullDataBtc.DateTime))

In [12]:
# Cast the reformatted time string to a timestamp, keep only time and price under
# their final column names, and cast the price to double.
CleandfBtc = FullDataBtc.withColumn("Cleaned_BTC_Time_New", FullDataBtc.Cleaned_BTC_Time.cast('timestamp'))
btc_exprs = ["Cleaned_BTC_Time_New as Date_Time", "Price"]
FinalBtc = CleandfBtc.selectExpr(*btc_exprs)
FinalBtc = FinalBtc.withColumn("Price", FinalBtc.Price.cast('double'))
FinalBtc.show(5)

+-------------------+-------+
|          Date_Time|  Price|
+-------------------+-------+
|2017-10-31 00:00:00|6142.46|
|2017-10-31 01:00:00|6139.47|
|2017-10-31 02:00:00| 6128.2|
|2017-10-31 03:00:00|6130.72|
|2017-10-31 04:00:00|6143.92|
+-------------------+-------+
only showing top 5 rows



In [13]:
# Check that the price frame now carries a timestamp and a numeric price column.
FinalBtc.printSchema()

root
 |-- Date_Time: timestamp (nullable = true)
 |-- Price: double (nullable = true)



In [14]:
# Snap each tweet time to a whole hour so it can be matched with the hourly prices.
# NOTE(review): despite the name, this ROUNDS to the nearest hour (17:43 -> 18:00)
# rather than truncating — confirm that is the intended bucketing.
dt_truncated = (round(unix_timestamp(col('Date_Time')) / 3600) * 3600).cast('timestamp')
FinalTw = FinalTw.withColumn('Date_Time', dt_truncated)
# Shift every bucket forward by five hours.
# NOTE(review): the source tweet strings shown above carry a +0000 offset, so
# verify which timezone this shift is meant to align with.
UTC = (unix_timestamp(col('Date_Time')) + 5*60*60).cast('timestamp')
FinalTw = FinalTw.withColumn('Date_Time', UTC)
FinalTw.show(10)

+-------------------+--------------------+---------------+
|          Date_Time|      Cleaned_Tweets|Sentiment_score|
+-------------------+--------------------+---------------+
|2017-11-09 23:00:00|The Failure of Se...|    -0.18888889|
|2017-11-09 23:00:00|Lots of love from...|     0.25833333|
|2017-11-09 23:00:00|Join our telegram...|            0.0|
|2017-11-09 23:00:00|DIGAF FLOAT 16M T...|          -0.05|
|2017-11-09 23:00:00|My luggage likes ...|            0.0|
|2017-11-09 23:00:00|As Bitcoin become...|           0.55|
|2017-11-09 23:00:00|A crucial feature...|            0.1|
|2017-11-09 23:00:00|As Bitcoin become...|           0.55|
|2017-11-09 23:00:00|As Bitcoin become...|           0.55|
+-------------------+--------------------+---------------+
only showing top 10 rows



In [57]:
# Average the sentiment of all tweets that fall into the same hour bucket,
# then give the result short column names ('date', 'score').
tweets_by_hour = FinalTw.select("Date_Time", "Sentiment_score").groupBy("Date_Time")
FinalTw_avg = tweets_by_hour.agg(avg("Sentiment_score"))
FinalTw_avg = FinalTw_avg.selectExpr(
    "Date_Time as date",
    "`avg(Sentiment_score)` as score",
)


In [58]:
# Check the types of the hourly-average frame (timestamp bucket and mean score).
FinalTw_avg.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- score: double (nullable = true)



In [17]:
# from pyspark.sql import functions as f
# df_with_text = FinalTw.groupby("Date_Time").agg(f.concat_ws(" ", f.collect_list(FinalTw.Cleaned_Tweets)))
# df_with_text.show(5)

# Disabled: concatenate all tweets of each hour into one text for a further round of sentiment analysis.

In [27]:
# Row count of FinalTw_avg; before the join below this is one row per hour bucket.
FinalTw_avg.count()

211

In [59]:
# Join the hourly sentiment averages to the hourly prices on the hour timestamp,
# keep the three analysis columns and order them chronologically.
# `asc` is never imported in this notebook (NameError on a fresh kernel), so the
# ascending sort is expressed through the already-imported `col`.
# NOTE(review): FinalTw_avg is overwritten here, so this cell cannot be run twice
# without re-running the aggregation cell first.
FinalTw_avg = FinalTw_avg.join(FinalBtc, FinalTw_avg.date == FinalBtc.Date_Time)
FinalTw_avg = FinalTw_avg.selectExpr('Date_Time', 'score as Sentiment_score', 'Price').sort(col("Date_Time").asc())

# FinalTw_avg.show()

+-------------------+-------------------+-------+
|          Date_Time|    Sentiment_score|  Price|
+-------------------+-------------------+-------+
|2017-10-31 05:00:00|0.09226261767339003|6158.76|
|2017-10-31 06:00:00|0.10376996608248826| 6105.9|
|2017-10-31 07:00:00|0.11169094251431187|6094.36|
|2017-10-31 08:00:00|0.08213433168664136|6125.13|
|2017-10-31 09:00:00|0.08937042968493204| 6165.0|
|2017-10-31 10:00:00|0.10267176426801841|6170.77|
|2017-10-31 11:00:00|0.12814641170317434|6233.74|
|2017-10-31 12:00:00|0.10549754665746006|6201.03|
|2017-10-31 13:00:00|0.13845789734397101|6332.34|
|2017-10-31 14:00:00|0.11779438208175155|6363.13|
|2017-10-31 15:00:00|0.09131997304766154|6365.16|
|2017-10-31 16:00:00|0.09301943334583278|6364.78|
|2017-10-31 17:00:00|0.09206832917391784|6361.79|
|2017-10-31 18:00:00|0.10691324426615845|6335.64|
|2017-10-31 19:00:00|0.12217075088870974|6341.15|
|2017-10-31 20:00:00|0.12817142761147934|6370.08|
|2017-10-31 21:00:00|0.09238581351736849|6393.26|


In [54]:
# Order chronologically and display. `asc` is not imported anywhere in this
# notebook, so the ascending order is built from the already-imported `col`.
FinalTw_avg = FinalTw_avg.sort(col("Date_Time").asc())

FinalTw_avg.show()

+-------------------+-------------------+-------+
|          Date_Time|    Sentiment_score|  Price|
+-------------------+-------------------+-------+
|2017-10-31 05:00:00|0.09226261767339003|6158.76|
|2017-10-31 06:00:00|0.10376996608248826| 6105.9|
|2017-10-31 07:00:00|0.11169094251431187|6094.36|
|2017-10-31 08:00:00|0.08213433168664136|6125.13|
|2017-10-31 09:00:00|0.08937042968493204| 6165.0|
|2017-10-31 10:00:00|0.10267176426801841|6170.77|
|2017-10-31 11:00:00|0.12814641170317434|6233.74|
|2017-10-31 12:00:00|0.10549754665746006|6201.03|
|2017-10-31 13:00:00|0.13845789734397101|6332.34|
|2017-10-31 14:00:00|0.11779438208175155|6363.13|
|2017-10-31 15:00:00|0.09131997304766154|6365.16|
|2017-10-31 16:00:00|0.09301943334583278|6364.78|
|2017-10-31 17:00:00|0.09206832917391784|6361.79|
|2017-10-31 18:00:00|0.10691324426615845|6335.64|
|2017-10-31 19:00:00|0.12217075088870974|6341.15|
|2017-10-31 20:00:00|0.12817142761147934|6370.08|
|2017-10-31 21:00:00|0.09238581351736849|6393.26|


In [60]:
# Collapse to one partition so Spark emits a single part-file instead of one per
# partition. "Two.csv" is created as a directory containing that part-file, not
# as a single flat file.
# NOTE(review): no write mode or header option is given; a re-run presumably fails
# if "Two.csv" already exists, and the output has no header row — confirm.
FinalTw_avg.repartition(1).write.csv("Two.csv")