# Binary Classification

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType
from pyspark.sql import functions as F
# Session shared by every cell below; getOrCreate() returns the existing
# session if one is already running under this app name.
spark = SparkSession.builder.appName("Assignment").getOrCreate()

In [None]:
# Load the tab-separated feature rows and their targets. No schema is passed
# here; the numeric columns are cast explicitly in a later cell.
train = spark.read.option("sep", "\t").csv("./Assignment-2/train.data")
sol = spark.read.option("sep", "\t").csv("./Assignment-2/train.solution")
# Name the solution column "label".
sol = sol.toDF('label')
# Work on a 10000-row subset of each file.
# NOTE(review): the later row-number join assumes both limits keep the same
# rows in the same order - confirm.
train = train.limit(10000)
sol = sol.limit(10000)
# feature.name is read with header=True only to obtain the column names.
cols = spark.read.csv('./Assignment-2/feature.name', sep= '\t', header= True)
train = train.toDF(*cols.columns)
# Strip the leading '#' from the three count columns.
train = train.withColumnRenamed("#followers","followers") \
        .withColumnRenamed("#favorites","favorites") \
        .withColumnRenamed("#friends","friends")

In [None]:
def convertColumn(df, names, newType):
    """Cast each column listed in *names* to *newType*.

    Args:
        df: DataFrame-like object supporting ``df[name]`` and ``withColumn``.
        names: Iterable of column names to cast.
        newType: Target type handed to ``Column.cast``.

    Returns:
        The DataFrame with every listed column replaced by its cast version;
        *df* itself when *names* is empty.
    """
    converted = df
    for column_name in names:
        cast_column = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, cast_column)
    return converted

In [None]:
#Changing data types
# tweet_id is removed from the feature frame before any further processing.
train = train.drop('tweet_id')
columns = ['followers', 'friends', 'favorites']
# Cast the three count columns and the target to integers.
train = convertColumn(train, columns, IntegerType())
sol = convertColumn(sol, ['label'], IntegerType())

In [None]:
#Converting labels into binary classes
# label > 0 becomes 1; everything else (including null, which fails the
# comparison) becomes 0.
sol = sol.withColumn("label",F.when(F.col("label")>0,1).otherwise(0))

In [None]:
# Start the working frame as a selection of every column of train.
train_sol = train.select("*")

In [None]:
#Generate a sequential id and join both dataframes.
# NOTE(review): ORDER BY (SELECT NULL) defines no real ordering, so pairing
# feature row i with label row i relies on Spark keeping file order - confirm.
train_sol.createOrReplaceTempView("tweets")
train_sol = spark.sql("SELECT row_number() OVER (ORDER BY (SELECT NULL)) as id,* \
          FROM tweets")
sol.createOrReplaceTempView("sol")
sol = spark.sql("SELECT row_number() OVER (ORDER BY (SELECT NULL)) as id,* \
          FROM sol")
# Left join keeps every feature row even when no label row matches.
train_sol = train_sol.join(sol, train_sol.id == sol.id,how='left')
# The helper id is no longer needed once the frames are joined.
train_sol = train_sol.drop("id")

In [None]:
# Break the space-separated "sentiment" string into its first token (pos)
# and second token (neg), then discard the original column.
split_col = F.split(train_sol['sentiment'], ' ')
train_sol = (
    train_sol
    .withColumn('pos', split_col.getItem(0))
    .withColumn('neg', split_col.getItem(1))
    .drop("sentiment")
)
# Both halves are stored as integers.
columns = ['pos', 'neg']
train_sol = convertColumn(train_sol, columns, IntegerType())

In [None]:
#Function that splits each entity and calculate the average of all the entities in one tweet
def enti_score(row):
    """Return the mean score of the entities attached to one tweet.

    Each element of *row* is split on ':' and its third field is parsed as a
    float. Elements that are None, have fewer than three fields, or whose
    third field is not numeric are skipped.

    Args:
        row: Iterable of entity strings, or None.

    Returns:
        The arithmetic mean of the parsed scores, or 0.0 when *row* is
        None/empty or no element yields a score.
    """
    if not row:
        return 0.0
    scores = []
    for entity in row:
        if entity is None:
            continue
        fields = entity.split(sep=":")
        try:
            scores.append(float(fields[2]))
        except (IndexError, ValueError):
            # Fewer than three fields, or a non-numeric score field.
            continue
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

In [None]:
#Create the UDF and create a new column with the output of the UDF
cal = F.udf(enti_score, FloatType())
# "entities" is split on ';' so the UDF receives one string per entity.
train_sol = train_sol.withColumn("ent_score",cal(F.split("entities","[;]")))
train_sol = train_sol.drop("entities")

In [None]:
# Rows whose value is the literal 'null;' get a count of 0 in each of the
# three columns below; the source column is dropped once it has been counted.
#Counting mentions (space-separated)
train_sol = train_sol.withColumn(
    "mentions_count",
    F.when(F.col("mentions") != 'null;', F.size(F.split("mentions", " "))).otherwise(0),
)
train_sol = train_sol.drop("mentions")
#Counting urls: split on the ":-:" separator; the count is pieces minus one.
# A raw string keeps the regex identical while avoiding Python's
# invalid-escape-sequence warning for "\-" and "\:".
train_sol = train_sol.withColumn(
    "url_count",
    F.when(F.col("urls") != 'null;', F.size(F.split("urls", r":\-\:")) - 1).otherwise(0),
)
train_sol = train_sol.drop("urls")
#Counting hashtags (space-separated)
train_sol = train_sol.withColumn(
    "hash_count",
    F.when(F.col("hashtags") != 'null;', F.size(F.split("hashtags", " "))).otherwise(0),
)
train_sol = train_sol.drop("hashtags")

In [None]:
#Generate a row_number by username ordered Date
# Rnumb = 1 is each user's earliest tweet under the Date ordering.
train_sol.createOrReplaceTempView("tweets")
train_sol = spark.sql("SELECT *,row_number() OVER (PARTITION BY username order by Date) as Rnumb \
          FROM tweets")

#Get the Date from previous tweet
# Self-join on the same user with Rnumb - 1; the first tweet has no match and
# gets a null PrevDate because of the LEFT JOIN.
train_sol.createOrReplaceTempView("tweets")
train_sol = spark.sql("SELECT t1.*,t2.Date as "'PrevDate'" \
          FROM tweets t1 \
          LEFT JOIN tweets t2 ON t1.username=t2.username and (t1.Rnumb-1)=t2.Rnumb")

#Get the days difference with previous tweet. 
train_sol = train_sol.withColumn("DaysSince",F.datediff(F.col("Date"),F.col("PrevDate")))

#First we generate a FirstTweetDate column with the date of the first tweet made by the user. 
train_sol.createOrReplaceTempView("tweets")
train_sol = spark.sql("SELECT t1.*,t2.Date as "'FirstTweetDate'"\
          FROM tweets t1 \
          LEFT JOIN tweets t2 ON t1.username=t2.username and t2.Rnumb=1")

#Second we get the difference between the tweet date and the firstTweetDate. 
train_sol = train_sol.withColumn("DaysSFirst",F.datediff(F.col("Date"),F.col("FirstTweetDate")))

#Change the 0s to 1s, so the first day of tweeting doesn't divide by 0.
train_sol = train_sol.withColumn("DaysSFirst",F.when(F.col("DaysSFirst")==0,1).otherwise(train_sol["DaysSFirst"]))

#Third we divide the accumulated tweet number (Rnumb) by DaysSFirst
train_sol = train_sol.withColumn("TweetsADay",train_sol["Rnumb"]/train_sol["DaysSFirst"])

#Drop helper columns
# PrevDate and DaysSince are dropped here without having fed any kept feature;
# only TweetsADay (and Rnumb, used by the next cell) survive.
train_sol = train_sol.drop("PrevDate","DaysSince","FirstTweetDate","DaysSFirst")

In [None]:
#Get the label of the previous tweet by the same user
train_sol.createOrReplaceTempView("tweets")
train_sol = spark.sql("SELECT t1.*,t2.label as "'PrevRT'" \
          FROM tweets t1 \
          LEFT JOIN tweets t2 ON t1.username=t2.username and (t1.Rnumb-1)=t2.Rnumb")
# A user's first tweet has no predecessor; treat that as 0.
train_sol = train_sol.withColumn("PrevRT",F.when(F.col("PrevRT").isNull(),0.0).otherwise(train_sol["PrevRT"]))

#Rnumb was only needed to find the previous tweet
train_sol = train_sol.drop("Rnumb")

In [None]:
#Generate the day number of week
# The legacy time parser is enabled before formatting with the 'u' pattern.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
train_sol = train_sol.withColumn("dayNumb",F.date_format(F.col('Date'),'u'))
# date_format returns a string; store the day number as an integer.
train_sol = train_sol.withColumn('dayNumb', train_sol['dayNumb'].cast(IntegerType()))

In [None]:
#Dropping columns that are not used as model features
train_sol = train_sol.drop("Date","username","timestamp")

In [None]:
# Display the columns that remain after feature engineering.
train_sol.columns

## Pearson Correlation Matrix

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Pack every remaining column (label included, so feature-vs-label
# correlations are part of the matrix) into one vector column.
assembler = VectorAssembler(inputCols=train_sol.columns, outputCol='vector_col')
df_vector = assembler.transform(train_sol).select('vector_col')
# No method argument is passed, so Correlation.corr uses its default.
matrix = Correlation.corr(df_vector, 'vector_col')

In [None]:
import numpy as np
# The result frame holds a single matrix cell: bring it to the driver as
# nested lists, round to four decimals, and show it as a labelled table.
rows = matrix.collect()[0][0].toArray().tolist()
np_array = np.around(np.array(rows), 4)
df = spark.createDataFrame(np_array.tolist(), train_sol.columns)
df.show()

In [None]:
import matplotlib.pyplot as plt
def plot_corr_matrix(correlations, attr, fig_no):
    """Draw a correlation matrix as a colour-coded grid.

    Args:
        correlations: 2-D array-like of coefficients, passed to ``plt.matshow``.
        attr: Sequence of names used as the x and y tick labels, one per
            row/column of *correlations*.
        fig_no: Matplotlib figure number to draw into.
    """
    # Use the labels supplied by the caller rather than a notebook global, so
    # the function works for any matrix/label pair.
    labels = list(attr)
    positions = range(len(labels))
    f = plt.figure(fig_no, figsize=[10, 10])
    # Fix the colour scale to the full correlation range [-1, 1].
    plt.matshow(correlations, fignum=f.number, vmax=1, vmin=-1)
    plt.xticks(positions, labels, fontsize=14, rotation=90)
    plt.yticks(positions, labels, fontsize=14)
    cb = plt.colorbar(location='right', shrink=0.8)
    cb.ax.tick_params(labelsize=12)
    plt.title('Correlation Matrix', fontsize=16)

----------Standard Scaling----------

In [None]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler

# Every column except the target goes into the "features" vector.
assembler = VectorAssembler(inputCols=train_sol.drop("label").columns, outputCol='features')
inputDF = assembler.transform(train_sol)

In [None]:
# Fit the scaler on the assembled vectors, then let the scaled vector take
# over the "features" name so downstream estimators need no changes.
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
scaler = standardScaler.fit(inputDF)
train_sol = (
    scaler.transform(inputDF)
    .drop("features")
    .withColumnRenamed("features_scaled", "features")
)

## Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# 80/20 split with a fixed seed so the split is repeatable.
train_data, test_data = train_sol.randomSplit([.8,.2],seed=42)
#lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
# Regularised variant: regParam=0.01 with an elastic-net mix of 0.8.
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10, regParam=0.01,elasticNetParam=0.8)
lrModel = lr.fit(train_data)
# Score the held-out rows.
predictions = lrModel.transform(test_data)

In [None]:
# Summary of the fitted model; these are metrics on the data passed to fit().
trainingSummary = lrModel.summary

In [None]:
# Training-set precision per label and training-set area under ROC.
print(trainingSummary.precisionByLabel)
print(trainingSummary.areaUnderROC)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate the held-out predictions; the second call overrides metricName
# for that evaluation only.
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))
print('Test Area Under PR', evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}))

In [None]:
#Analysing coefficients
# Take the names from the assembler that built "features": position i of the
# coefficient vector belongs to input column i. Slicing train_data.columns is
# wrong here because that list also contains "label", which shifts the names.
cols = assembler.getInputCols()
# toArray() yields one value per feature, including zeros that a sparse
# representation would omit.
for name, val in zip(cols, lrModel.coefficients.toArray()):
    print(name, ' = ', val)

----------Cross Validation------------

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)

# 3 x 3 grid: every regParam value is combined with every elasticNetParam value.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01 ,0.1, 0.3]) \
    .addGrid(lr.elasticNetParam, [0, .5, .8]) \
    .build()

# Single-stage pipeline, so the fitted classifier is stages[-1] of the best model.
pipeline = Pipeline(stages=[lr])
# 10-fold cross-validation scored by BinaryClassificationEvaluator.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)

In [None]:
# Run the grid search on the training split only.
cvModel = crossval.fit(train_data)
# Summary of the winning pipeline's last stage.
trainingSummary = cvModel.bestModel.stages[-1].summary

In [None]:
# Metrics from the best model's summary (not held-out data).
print(trainingSummary.precisionByLabel)
print(trainingSummary.areaUnderROC)

In [None]:
# Score the held-out split with the cross-validated model.
prediction = cvModel.transform(test_data)

In [None]:
best_lr = cvModel.bestModel
# Name each coefficient after the assembler input at the same position;
# train_data.columns cannot be sliced for this because it also holds "label".
cols = assembler.getInputCols()
for name, val in zip(cols, best_lr.stages[-1].coefficients.toArray()):
    print(name, ' = ', val)

In [None]:
# Print both tuned hyper-parameters explicitly: a notebook only echoes the
# last bare expression of a cell, so the first value would otherwise be lost.
print(best_lr.stages[-1].getRegParam())
print(best_lr.stages[-1].getElasticNetParam())

## Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
# Same seeded 80/20 split as the logistic-regression section.
train_data, test_data = train_sol.randomSplit([.8,.2],seed=42)

# Random forest with default hyper-parameters.
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)

In [None]:
# Summary attached to the fitted forest.
trainingSummary = rfModel.summary

In [None]:
# Collect the accuracy and weighted metrics exposed by the summary and print
# them as one report.
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
# weightedFMeasure is a method, unlike the other attributes read here.
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Held-out metrics for the random forest predictions.
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))
print('Test Area Under PR', evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}))

In [None]:
#Analysing feature importance
# Names come from the assembler that built "features" (train_data.columns
# also contains "label", which would shift every later name).
cols = assembler.getInputCols()
# toArray() gives one importance per feature; .values on a sparse vector
# holds only the non-zero entries, so its positions are not feature indices.
for idx, (name, val) in enumerate(zip(cols, rfModel.featureImportances.toArray())):
    print(idx, name, ' = ', val)

In [None]:
import matplotlib.pyplot as plt
# Dense array: one importance per feature, zeros included, so the bar count
# always matches the number of tick labels.
importances = rfModel.featureImportances.toArray()
# Feature names are the columns left after removing the vector and targets.
feature_list = train_data.drop("features","label","labels_cat").columns
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation=40)
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Importances')

# Multi-class Classification

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType
from pyspark.sql import functions as F
# Reuses the running session if there is one, otherwise starts it.
spark = SparkSession.builder.appName("Assignment").getOrCreate()

In [None]:
# Reload the raw tab-separated files for the second experiment.
train = spark.read.option("sep", "\t").csv("./Assignment-2/train.data")
sol = spark.read.option("sep", "\t").csv("./Assignment-2/train.solution")
sol = sol.toDF('label')
# This section uses a 100000-row subset of each file.
# NOTE(review): the later row-number join assumes both limits keep the same
# rows in the same order - confirm.
train = train.limit(100000)
sol = sol.limit(100000)
# feature.name is read with header=True only to obtain the column names.
cols = spark.read.csv('./Assignment-2/feature.name', sep= '\t', header= True)
train = train.toDF(*cols.columns)
# Strip the leading '#' from the three count columns.
train = train.withColumnRenamed("#followers","followers") \
        .withColumnRenamed("#favorites","favorites") \
        .withColumnRenamed("#friends","friends")

In [None]:
def convertColumn(df, names, newType):
    """Cast each column listed in *names* to *newType*.

    Args:
        df: DataFrame-like object supporting ``df[name]`` and ``withColumn``.
        names: Iterable of column names to cast.
        newType: Target type handed to ``Column.cast``.

    Returns:
        The DataFrame with every listed column replaced by its cast version;
        *df* itself when *names* is empty.
    """
    converted = df
    for column_name in names:
        cast_column = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, cast_column)
    return converted

In [None]:
#Changing data types
# tweet_id is removed from the feature frame before any further processing.
train = train.drop('tweet_id')
columns = ['followers', 'friends', 'favorites']
# Cast the three count columns and the target to integers.
train = convertColumn(train, columns, IntegerType())
sol = convertColumn(sol, ['label'], IntegerType())

In [None]:
# Start the working frame as a selection of every column of train.
train_sol = train.select("*")

In [None]:
#Generate a sequential id and join.
# NOTE(review): ORDER BY (SELECT NULL) defines no real ordering, so pairing
# feature row i with label row i relies on Spark keeping file order - confirm.
train_sol.createOrReplaceTempView("tweets")
train_sol = spark.sql("SELECT row_number() OVER (ORDER BY (SELECT NULL)) as id,* \
          FROM tweets")
sol.createOrReplaceTempView("sol")
sol = spark.sql("SELECT row_number() OVER (ORDER BY (SELECT NULL)) as id,* \
          FROM sol")
# Left join keeps every feature row even when no label row matches.
train_sol = train_sol.join(sol, train_sol.id == sol.id,how='left')
# The helper id is no longer needed once the frames are joined.
train_sol = train_sol.drop("id")

In [None]:
#Remove tweets with 0 retweets.
# Rows with a null label fail the comparison and are removed as well.
train_sol = train_sol.filter(train_sol.label>0)

In [None]:
def categories_gen(row):
    """Map a retweet count to the name of its bucket.

    Args:
        row: Numeric retweet count, or None.

    Returns:
        '1-4', '5-20', '21-80' or '80+'; None when *row* is None. Any value
        outside (0, 80] - including zero and negatives - falls into '80+',
        as it always has; the notebook filters to label > 0 before calling.
    """
    if row is None:
        # Propagate a missing count as a missing category instead of raising.
        return None
    if 0 < row <= 4:
        return '1-4'
    if 4 < row <= 20:
        return '5-20'
    if 20 < row <= 80:
        return '21-80'
    return '80+'

In [None]:
from pyspark.ml.feature import StringIndexer
#Converting int label to categories(string)
#cal = F.udf(categories_gen, FloatType())
# No return type is given, so the UDF uses F.udf's default return type.
cal = F.udf(categories_gen)
train_sol = train_sol.withColumn("labels_cat",cal(F.col("label")))
# The raw retweet count is replaced by its bucket name...
train_sol = train_sol.drop("label")
#Converting string class into index
# ...and "label" is re-created as the numeric index of that bucket, so from
# here on "labels_cat" is the string and "label" is the number.
string_indexer = StringIndexer(inputCol="labels_cat", outputCol="label")
fitted_indexer = string_indexer.fit(train_sol)
train_sol = fitted_indexer.transform(train_sol)
#train_sol = train_sol.withColumn("labels_cat", train_sol["labels_cat"].cast(DoubleType()))

In [None]:
# Rows per (index, bucket) pair, smallest class first.
train_sol.groupby("label","labels_cat").count().orderBy("count").show()

------Undersampling----------

In [None]:
# Size of the rarest bucket: count per bucket, sort ascending, keep the first.
class_sizes = train_sol.groupby("labels_cat").count()
row = class_sizes.orderBy("count").take(1)
minSamples = row[0]["count"]
print(minSamples)

In [None]:
#Create new dataframes with smaller samples (minSamples).
# Filter on "label", the numeric StringIndexer output. "labels_cat" holds the
# bucket strings ('1-4', ...), which never equal 0.0-3.0, so filtering on it
# matched no rows and made every ratio below divide by zero.
# A fixed seed keeps the undersampling repeatable, like the seeded
# randomSplit used elsewhere in this notebook.
cat0_all = train_sol.filter(F.col("label") == 0.0)
cat0Ratio = minSamples / cat0_all.count()
cat0 = cat0_all.sample(withReplacement=False, fraction=cat0Ratio, seed=42)

cat1_all = train_sol.filter(F.col("label") == 1.0)
cat1Ratio = minSamples / cat1_all.count()
cat1 = cat1_all.sample(withReplacement=False, fraction=cat1Ratio, seed=42)

cat2_all = train_sol.filter(F.col("label") == 2.0)
cat2Ratio = minSamples / cat2_all.count()
cat2 = cat2_all.sample(withReplacement=False, fraction=cat2Ratio, seed=42)

# NOTE(review): assumes index 3.0 is the rarest class (the one minSamples was
# taken from), which holds if StringIndexer orders by descending frequency and
# all four buckets are present - confirm.
cat3 = train_sol.filter(F.col("label") == 3.0)

In [None]:
# Stack the four per-class samples back into one training frame.
train_sol = cat0.union(cat1).union(cat2).union(cat3)

In [None]:
# Check the class sizes after undersampling.
train_sol.groupby("labels_cat").count().orderBy("labels_cat").show()

-------Feature Engineering--------

In [None]:
# Break the space-separated "sentiment" string into its first token (pos)
# and second token (neg), then discard the original column.
split_col = F.split(train_sol['sentiment'], ' ')
train_sol = (
    train_sol
    .withColumn('pos', split_col.getItem(0))
    .withColumn('neg', split_col.getItem(1))
    .drop("sentiment")
)
# Both halves are stored as integers.
columns = ['pos', 'neg']
train_sol = convertColumn(train_sol, columns, IntegerType())

In [None]:
#Function that splits each entity and calculate the average of all the entities in one tweet
def enti_score(row):
    """Return the mean score of the entities attached to one tweet.

    Each element of *row* is split on ':' and its third field is parsed as a
    float. Elements that are None, have fewer than three fields, or whose
    third field is not numeric are skipped.

    Args:
        row: Iterable of entity strings, or None.

    Returns:
        The arithmetic mean of the parsed scores, or 0.0 when *row* is
        None/empty or no element yields a score.
    """
    if not row:
        return 0.0
    scores = []
    for entity in row:
        if entity is None:
            continue
        fields = entity.split(sep=":")
        try:
            scores.append(float(fields[2]))
        except (IndexError, ValueError):
            # Fewer than three fields, or a non-numeric score field.
            continue
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

In [None]:
#Create the UDF and create a new column with the output of the UDF
cal = F.udf(enti_score, FloatType())
# "entities" is split on ';' so the UDF receives one string per entity.
train_sol = train_sol.withColumn("ent_score",cal(F.split("entities","[;]")))
train_sol = train_sol.drop("entities")

In [None]:
# Rows whose value is the literal 'null;' get a count of 0 in each of the
# three columns below; the source column is dropped once it has been counted.
#Counting mentions (space-separated)
train_sol = train_sol.withColumn(
    "mentions_count",
    F.when(F.col("mentions") != 'null;', F.size(F.split("mentions", " "))).otherwise(0),
)
train_sol = train_sol.drop("mentions")
#Counting urls: split on the ":-:" separator; the count is pieces minus one.
# A raw string keeps the regex identical while avoiding Python's
# invalid-escape-sequence warning for "\-" and "\:".
train_sol = train_sol.withColumn(
    "url_count",
    F.when(F.col("urls") != 'null;', F.size(F.split("urls", r":\-\:")) - 1).otherwise(0),
)
train_sol = train_sol.drop("urls")
#Counting hashtags (space-separated)
train_sol = train_sol.withColumn(
    "hash_count",
    F.when(F.col("hashtags") != 'null;', F.size(F.split("hashtags", " "))).otherwise(0),
)
train_sol = train_sol.drop("hashtags")

In [None]:
# Remove the columns that are not used as model features in this section.
train_sol = train_sol.drop("Date","username","timestamp")

In [None]:
# Display the columns that remain after feature engineering.
train_sol.columns

----------Standard Scaler---------

In [None]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
# Exclude both target columns from the features: "labels_cat" is the bucket
# name and "label" is its numeric index, so leaving either in would hand the
# answer to the model.
cols = train_sol.drop("labels_cat", "label").columns
assembler = VectorAssembler(inputCols=cols, outputCol='features')
inputDF = assembler.transform(train_sol)

In [None]:
# Fit the scaler on the assembled vectors, then let the scaled vector take
# over the "features" name so downstream estimators need no changes.
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
scaler = standardScaler.fit(inputDF)
train_sol = (
    scaler.transform(inputDF)
    .drop("features")
    .withColumnRenamed("features_scaled", "features")
)

## Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
# Seeded 80/20 split so the experiment is repeatable.
train_data, test_data = train_sol.randomSplit([.8,.2],seed=42)

# Train on "label", the numeric class index produced by StringIndexer;
# "labels_cat" holds the bucket names as strings and cannot be a label column.
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)

In [None]:
# Summary attached to the fitted forest.
trainingSummary = rfModel.summary

In [None]:
# Collect the accuracy and weighted metrics exposed by the summary and print
# them as one report.
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
# weightedFMeasure is a method, unlike the other attributes read here.
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

In [None]:
# Per-class true-positive rate and recall from the training summary.
print(trainingSummary.truePositiveRateByLabel)
print(trainingSummary.recallByLabel)

In [None]:
?MulticlassClassificationEvaluator

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
prediction = rfModel.transform(test_data)
# Compare predictions with "label", the numeric class index; "labels_cat"
# contains the bucket names as strings and cannot be evaluated against.
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label')

In [None]:
# MulticlassClassificationEvaluator reports F1 unless metricName is changed,
# so caption the number as F1 (it is not an area under the ROC curve).
print('Test F1', evaluator.evaluate(prediction))

------------Feature importance----------

https://people.stat.sc.edu/haigang/improvement.html

In [None]:
# Names come from the assembler that built "features", so position i always
# matches importance i regardless of which columns the frame holds now.
cols = assembler.getInputCols()
# toArray() gives one importance per feature; .values on a sparse vector
# holds only the non-zero entries, so its positions are not feature indices.
for idx, (name, val) in enumerate(zip(cols, rfModel.featureImportances.toArray())):
    print(idx, name, ' = ', val)

In [None]:
import matplotlib.pyplot as plt
# Dense array: one importance per feature, zeros included.
importances = rfModel.featureImportances.toArray()
# Tick labels are the assembler's input columns, i.e. exactly the features
# the importances refer to, so bars and labels always line up.
feature_list = assembler.getInputCols()
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation=40)
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Importances')

--------------Cross Validation---------------

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Train on "label" (numeric class index). This is also the column the default
# MulticlassClassificationEvaluator below scores against, so estimator and
# evaluator now agree; "labels_cat" is the string bucket name.
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')

# Tune only the number of trees.
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [3 ,10 , 20])\
    .build()

# Single-stage pipeline, so the fitted forest is stages[-1] of the best model.
pipeline = Pipeline(stages=[rf])
# 10-fold cross-validation scored by MulticlassClassificationEvaluator.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=10)

In [None]:
# Run the grid search on the training split only.
cvModel = crossval.fit(train_data)
# Summary of the winning pipeline's last stage.
trainingSummary = cvModel.bestModel.stages[-1].summary

In [None]:
# Metrics from the best model's summary (not held-out data).
print(trainingSummary.precisionByLabel)
print(trainingSummary.accuracy)

In [None]:
# The name best_lr is carried over from the logistic-regression section; the
# model held here is the best random-forest pipeline.
best_lr = cvModel.bestModel
print(best_lr.stages[-1].getMaxBins())
print(best_lr.stages[-1].getMaxDepth())
# NOTE(review): accessed without parentheses - this shows the tree count only
# if getNumTrees is a property in the installed PySpark version; confirm.
best_lr.stages[-1].getNumTrees

In [None]:
# Same report as before, now for the cross-validated best model's summary.
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
# weightedFMeasure is a method, unlike the other attributes read here.
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))