In [1]:
!pwd

/home/su2023grp7/project/UNLV_Procject


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.utils import resample

import time

import seaborn as sns
import matplotlib.pyplot as plt


def main():
    df = pd.read_csv("train.csv")
    df.drop(["Id"], axis=1, inplace=True)

    df.columns = map(str.lower, df.columns)
    df.rename(columns={"married/single": "married_single"}, inplace=True)

    # Category cols to num
    cate_cols = ["married_single", "profession", "house_ownership", "car_ownership", "city", "state"]

    for col in cate_cols:
        le = LabelEncoder()
        le = le.fit(df[col])
        df[col] = le.transform(df[col])

    print("Label Encoding-Done.")

    # Down sampling
    subset_0 = df[df["risk_flag"] == 0]
    subset_1 = df[df["risk_flag"] == 1]

    subset_0_downsampled = resample(subset_0,
                                    replace=False,
                                    n_samples=len(subset_1),
                                    random_state=42)

    df = pd.concat([subset_0_downsampled, subset_1])

    print("Down Sampling-Done.")

    X = df.drop(["risk_flag"], axis=1)
    y = df["risk_flag"].apply(lambda x: int(x))

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = svm.SVC()

    # Start: training
    start_time = time.time()

    clf.fit(X_train, y_train)

    # End: training
    end_time = time.time()
    # Calculate training time
    elapsed_time = end_time - start_time

    y_pred = clf.predict(X_val)

    auc = roc_auc_score(y_val, y_pred)

    print(f"AUC = {auc:.3f}")
    print(f"Elapsed Time: {elapsed_time // 60} min {elapsed_time % 60:.2f} sec")


if __name__ == "__main__":
    main()

Label Encoding-Done.
Down Sampling-Done.
AUC = 0.516
Elapsed Time: 1.0 min 52.73 sec


In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col as scol
from pyspark.sql.functions import rand

import time


def main():
    spark = SparkSession.builder.appName("UNLV").getOrCreate()

    df = spark.read.csv("train.csv", header=True, inferSchema=True)
    df = df.drop("Id").withColumnRenamed("married/single", "married_single")
    df = df.toDF(*(col.lower() for col in df.columns))
    df = df.withColumn("risk_flag", scol("risk_flag").cast("integer"))

    # Category cols to num
    cate_cols = ["married_single", "profession", "house_ownership", "car_ownership", "city", "state"]

    indexers = [StringIndexer(inputCol=col, outputCol=col + "_idx").fit(df) for col in cate_cols]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)
    df = df.drop(*cate_cols)

    print("Label Encoding-Done.")

    assembler = VectorAssembler(
        inputCols=[col for col in df.columns if col != "risk_flag"],
        outputCol="features"
    )
    df = assembler.transform(df)
    df = df.select(["features", "risk_flag"])

    df.show(5, truncate=False)

    # Down sampling
    pos_count = df.filter("risk_flag = 1").count()
    neg_df = df.filter("risk_flag = 0")
    sampled_neg_df = neg_df.orderBy(rand(seed=42)).limit(pos_count)
    df = sampled_neg_df.union(df.filter("risk_flag = 1"))

    print("Down Sampling-Done.")

    # Define model
    train, val = df.randomSplit([0.8, 0.2], seed=42)

    svm = LinearSVC(labelCol="risk_flag", weightCol="risk_flag", maxIter=100) # default 100

    # Start: training
    start_time = time.time()

    model = svm.fit(train)

    # End: training
    end_time = time.time()
    # Calculate training time
    elapsed_time = end_time - start_time

    predictions = model.transform(val)

    evaluator = BinaryClassificationEvaluator(labelCol="risk_flag", metricName="areaUnderROC")
    auc = evaluator.evaluate(predictions)

    print(f"AUC = {auc:.3f}")
    print(f"Elapsed Time: {elapsed_time // 60} min {elapsed_time % 60:.2f} sec")


if __name__ == "__main__":
    main()

                                                                                

Label Encoding-Done.
+---------------------------------------------------------+---------+
|features                                                 |risk_flag|
+---------------------------------------------------------+---------+
|[1303834.0,23.0,3.0,3.0,13.0,0.0,11.0,0.0,0.0,151.0,6.0] |0        |
|[7574516.0,40.0,10.0,9.0,13.0,0.0,20.0,0.0,0.0,119.0,1.0]|0        |
|[3991815.0,66.0,4.0,4.0,10.0,1.0,13.0,0.0,0.0,228.0,14.0]|0        |
|[6256451.0,41.0,2.0,2.0,12.0,0.0,20.0,0.0,1.0,287.0,17.0]|1        |
|[5768871.0,47.0,11.0,3.0,14.0,0.0,48.0,0.0,0.0,143.0,5.0]|1        |
+---------------------------------------------------------+---------+
only showing top 5 rows

Down Sampling-Done.


23/07/11 11:19:01 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

AUC = 0.501
Elapsed Time: 0.0 min 6.32 sec
