In [1]:
import findspark
findspark.find()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext

In [2]:
try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
    sc = ps.SparkContext("local", "Simple App")
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")

Just created a SparkContext


In [3]:
df = sqlContext.read.parquet("day_data_data2018-12-19 00_00_00.parquet")
type(df)

pyspark.sql.dataframe.DataFrame

In [4]:
df = df.withColumnRenamed("stock_price_col", "label")
df.limit(5).toPandas().head()

Unnamed: 0,index,created_at,id_str,text,truncated,verified,followers_count,favourites_count,label
0,4,2018-12-17 19:06:50,1074818231544832001,RT @Cuds_1246: Maybe they should go look for n...,False,False,30535.0,94052.0,1520.056
1,11,2018-12-17 19:31:14,1074824374119587841,RT @CKJCryptonews: I like this rumour \nAZ and...,False,False,159.0,720.0,1520.056
2,129,2018-12-17 19:09:40,1074818946224685056,RT @GdsCleo: Do you want to know what Sweden’s...,False,False,2087.0,45060.0,1520.056
3,148,2018-12-17 19:17:25,1074820896513318913,RT @GinaGomezDunn: I’m so excited to share my ...,False,False,8810.0,141918.0,1520.056
4,260,2018-12-17 19:14:35,1074820185134120961,RT @KingAntho_: #Silenttribute are so $weet :...,False,False,113.0,47.0,1520.056


In [5]:
df.printSchema()

root
 |-- index: long (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id_str: long (nullable = true)
 |-- text: string (nullable = true)
 |-- truncated: boolean (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- followers_count: double (nullable = true)
 |-- favourites_count: double (nullable = true)
 |-- label: double (nullable = true)



In [None]:
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1,pat2))
www_pat = r'www.[^ ]+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def pre_processing(column):
    first_process = re.sub(combined_pat, '', column)
    second_process = re.sub(www_pat, '', first_process)
    third_process = second_process.lower()
    fourth_process = neg_pattern.sub(lambda x: negations_dic[x.group()], third_process)
    result = re.sub(r'[^A-Za-z ]','',fourth_process)
    return result.strip()

In [6]:
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed = 2000)

In [7]:
from pyspark.ml.feature import Tokenizer, NGram, CountVectorizer, IDF, VectorAssembler, Binarizer, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor

In [None]:
tokenizer = Tokenizer(inputCol='text',outputCol='words')

In [8]:
input_cols = ['1_tfidf', "2_tfidf", "3_tfidf", "4_tfidf", "5_tfidf"]
# "created_at", "id_str_oh", "truncated", "verified", "followers_count", "favourites_count"

ngrams = [NGram(n=i, inputCol='words', outputCol='{0}_grams'.format(i)) for i in range(1,6)]
cv = [CountVectorizer(vocabSize=10000, inputCol='{0}_grams'.format(i), outputCol='{0}_tf'.format(i)) for i in range(1,6)]
idf = [IDF(inputCol='{0}_tf'.format(i), outputCol='{0}_tfidf'.format(i), minDocFreq=10) for i in range(1,6)]
# binarizer1 = [Binarizer(threshold=1.0, inputCol="truncated", outputCol="bi_truncated")]
# binarizer2 = [Binarizer(threshold=1.0, inputCol="verified", outputCol="bi_verified")]
# stringind = [StringIndexer(inputCol="id_str", outputCol="id_str_idx")]
# onehot = [OneHotEncoder(inputCol="id_str_idx", outputCol="id_str_oh")]
assembler = [VectorAssembler(inputCols=input_cols, outputCol='features')]
pipeline = Pipeline(stages=ngrams+cv+idf+assembler)

pipelineFit = pipeline.fit(df)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.limit(5).toPandas().head()

Unnamed: 0,index,created_at,id_str,text,truncated,verified,followers_count,favourites_count,label,words,...,2_tf,3_tf,4_tf,5_tf,1_tfidf,2_tfidf,3_tfidf,4_tfidf,5_tfidf,features
0,4,2018-12-17 19:06:50,1074818231544832001,RT @Cuds_1246: Maybe they should go look for n...,False,False,30535.0,94052.0,1520.056,"[rt, @cuds_1246:, maybe, they, should, go, loo...",...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.6167920792271944, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.6167920792271944, 0.0, 0.0, 0.0, 0.0, ..."
1,11,2018-12-17 19:31:14,1074824374119587841,RT @CKJCryptonews: I like this rumour \nAZ and...,False,False,159.0,720.0,1520.056,"[rt, @ckjcryptonews:, i, like, this, rumour, ,...",...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2527294080072462, 0.6167920792271944, 1.108...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2527294080072462, 0.6167920792271944, 1.108..."
2,129,2018-12-17 19:09:40,1074818946224685056,RT @GdsCleo: Do you want to know what Sweden’s...,False,False,2087.0,45060.0,1520.056,"[rt, @gdscleo:, do, you, want, to, know, what,...",...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.6167920792271944, 0.0, 1.2945051299572...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.63532893...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.6167920792271944, 0.0, 1.2945051299572..."
3,148,2018-12-17 19:17:25,1074820896513318913,RT @GinaGomezDunn: I’m so excited to share my ...,False,False,8810.0,141918.0,1520.056,"[rt, @ginagomezdunn:, i’m, so, excited, to, sh...",...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.6167920792271944, 0.0, 0.0, 2.53588935...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.6167920792271944, 0.0, 0.0, 2.53588935..."
4,260,2018-12-17 19:14:35,1074820185134120961,RT @KingAntho_: #Silenttribute are so $weet :...,False,False,113.0,47.0,1520.056,"[rt, @kingantho_:, #silenttribute, , are, so, ...",...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(5.010917632028985, 0.6167920792271944, 0.0, 0...","(2.6674847950899876, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(5.010917632028985, 0.6167920792271944, 0.0, 0..."


In [None]:
# dt = DecisionTreeRegressor(maxDepth=25)
# dt_model = dt.fit(train_df)
# predictions = dt_model.transform(val_df)

# from pyspark.ml.evaluation import RegressionEvaluator
# evaluator = RegressionEvaluator(rawPredictionCol="rawPrediction")
# evaluator.evaluate(predictions)

In [None]:
# predictions.select("prediction", "label", "features").limit(5).toPandas().head()