In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import LinearSVC as SKLinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LinearSVC as SparkLinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

random_state = 772659

# Data Description

In [2]:
# Input table: a tab-separated file looked up relative to the working directory.
raw_df_path = "adult.tsv"
raw_df = pd.read_csv(
    raw_df_path,
    sep="\t",
)

In [3]:
raw_df.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
age,39.0,50.0,38.0,53.0,28.0,37.0,49.0,52.0,31.0,42.0
workclass,7.0,6.0,4.0,4.0,4.0,4.0,4.0,6.0,4.0,4.0
fnlwgt,77516.0,83311.0,215646.0,234721.0,338409.0,284582.0,160187.0,209642.0,45781.0,159449.0
education,9.0,9.0,11.0,1.0,9.0,12.0,6.0,11.0,12.0,9.0
education-num,13.0,13.0,9.0,7.0,13.0,14.0,5.0,9.0,14.0,13.0
marital-status,4.0,2.0,0.0,2.0,2.0,2.0,3.0,2.0,4.0,2.0
occupation,1.0,4.0,6.0,6.0,10.0,4.0,8.0,4.0,10.0,4.0
relationship,1.0,0.0,1.0,0.0,5.0,5.0,1.0,0.0,1.0,0.0
race,4.0,4.0,4.0,2.0,2.0,4.0,2.0,4.0,4.0,4.0
sex,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


# Data Preprocessing

In [4]:
# Everything except the label column is treated as a model feature.
feature_df = raw_df.drop(columns=["target"])

In [5]:
feature_df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [6]:
# Z-score every feature column: subtract the column mean and divide by the
# column's standard deviation as computed by DataFrame.std().
# NOTE(review): the statistics are computed over all rows, before the
# train/test split further down, so held-out rows influence the scaling —
# confirm this is acceptable for the comparison being made.
feature_mean = feature_df.mean()
feature_std = feature_df.std()
# A constant column has std == 0; dividing by it would turn the whole column
# into NaN. Dividing by 1 instead leaves such a column at all zeros.
feature_std[feature_std == 0] = 1.0
feature_df = (feature_df - feature_mean) / feature_std

In [7]:
feature_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,0.025996,2.137337,-1.061968,-0.332539,1.1365,0.916129,-1.318446,-0.276682,0.39238,0.704213,0.146931,-0.217125,-0.034087,0.289459
1,0.8283,1.454386,-1.007094,-0.332539,1.1365,-0.410393,-0.609312,-0.900843,0.39238,0.704213,-0.144802,-0.217125,-2.213009,0.289459
2,-0.046941,0.088484,0.246031,0.183658,-0.419331,-1.736914,-0.136556,-0.276682,0.39238,0.704213,-0.144802,-0.217125,-0.034087,0.289459
3,1.04711,0.088484,0.426659,-2.397326,-1.197247,-0.410393,-0.136556,-0.900843,-1.971725,0.704213,-0.144802,-0.217125,-0.034087,0.289459
4,-0.776309,0.088484,1.408515,-0.332539,1.1365,-0.410393,0.808957,2.219961,-1.971725,-1.419995,-0.144802,-0.217125,-0.034087,-4.083338


In [8]:
# New frame: the standardized features plus the untouched label column
# appended at the end (feature_df itself is not modified).
normalized_df = feature_df.assign(target=raw_df["target"])

# Train test split

In [9]:
# Hold out a seeded random 20% of the rows for testing ...
test_df = normalized_df.sample(
    frac=0.2,
    random_state=random_state,
)
# ... and train on every row whose index label was not sampled.
train_df = normalized_df.drop(index=test_df.index)

# Performance measuring

# Scikit-learn - SVC

## Time measure

In [10]:

# scikit-learn receives the feature columns only; labels are passed separately.
sk_train_df, sk_test_df = (
    split.drop(columns="target") for split in (train_df, test_df)
)

In [11]:
%%timeit -r 5 
sk_model = SKLinearSVC(max_iter=1000)
sk_model.fit(sk_train_df, train_df["target"])

3.79 s ± 115 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [12]:
# Fit the model used by the prediction-timing cell below. The estimator is
# seeded with the notebook-wide random_state so the fitted coefficients are
# reproducible across kernel restarts.
sk_model = SKLinearSVC(max_iter=1000, random_state=random_state)
sk_model.fit(sk_train_df, train_df["target"])

In [13]:
%%timeit -r 5 
sk_model.predict(sk_test_df)

1.93 ms ± 1.04 ms per loop (mean ± std. dev. of 5 runs, 100 loops each)


## Prediction evaluation

In [14]:
# Train and predict for the quality metrics. The estimator is seeded with the
# notebook-wide random_state so the reported accuracy / F1 are reproducible.
sk_model = SKLinearSVC(max_iter=1000, random_state=random_state)
sk_model.fit(sk_train_df, train_df["target"])
sk_prediction = sk_model.predict(sk_test_df)

In [15]:
accuracy_score(test_df["target"], sk_prediction)

0.823914823914824

In [16]:
f1_score(test_df["target"], sk_prediction)

0.8917285660329851

# Spark - SVC

In [27]:

# Create (or reuse) the SparkSession bound to the master at spark://spark:7077.
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("Ass3-Q1")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)


In [28]:
# Pick the feature columns by name rather than by position, so the assembler
# stays correct even if "target" is not the last column of train_df.
input_cols = [col for col in train_df.columns if col != "target"]
transformer = VectorAssembler(inputCols=input_cols, outputCol="features")

def pandas_to_spark(pd_df):
    """Convert a pandas DataFrame into the two-column Spark DataFrame used by the model.

    Args:
        pd_df: pandas DataFrame containing every column named in ``input_cols``
            plus a "target" column.

    Returns:
        A Spark DataFrame with a "features" vector column (assembled from
        ``input_cols`` by the module-level ``transformer``) and the "target"
        column.
    """
    spark_df = spark.createDataFrame(pd_df)
    return transformer.transform(spark_df).select("features", "target")

In [29]:
# Run both splits through the same pandas -> Spark conversion.
spark_train_df, spark_test_df = map(pandas_to_spark, (train_df, test_df))

## Time measure

In [30]:
%%timeit -r 5 
# Timed body: construct the Spark LinearSVC estimator and fit it on the
# training DataFrame. The fitted model returned by fit() is not kept here.
lsvc = SparkLinearSVC(labelCol="target", maxIter=1000)
lsvc.fit(spark_train_df)

                                                                                

7.46 s ± 242 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [21]:
# Fit once outside of the timing cells; `lsvc` ends up holding the fitted
# model that the transform-timing cell below calls.
lsvc = SparkLinearSVC(labelCol="target", maxIter=1000).fit(spark_train_df)


In [22]:
%%timeit -r 5 
lsvc.transform(spark_test_df)

21.6 ms ± 902 µs per loop (mean ± std. dev. of 5 runs, 10 loops each)


## Prediction evaluation

In [23]:
# Train the Spark model and score the held-out split.
lsvc = SparkLinearSVC(labelCol="target", maxIter=1000)
lsvc = lsvc.fit(spark_train_df)
pred = lsvc.transform(spark_test_df)

# Fetch labels and predictions together in a single Spark action so that each
# label is guaranteed to stay paired with its own prediction (two independent
# collect() calls re-run the plan twice and rely on matching row order).
pred_pd = pred.select("target", "prediction").toPandas()
y_orig = pred_pd["target"]
y_pred = pred_pd["prediction"]

# scikit-learn metrics take (y_true, y_pred), in that order.
acc = accuracy_score(y_orig, y_pred)
print("Prediction Accuracy: ", acc)

print("F1 score:")
f1_score(y_orig, y_pred)

23/11/15 22:37:11 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Prediction Accuracy:  0.8108108108108109
F1 score:


0.8864306784660767

In [31]:
spark.stop()