In [1]:
!pip install pyspark
!pip install findspark

# Import SparkSession
from pyspark.sql import SparkSession

# Create a Spark Session
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Check Spark Session Information
spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=20006df7acfcd6a7d2121df5683f0dfa41d808a5dfdeeb0b074ef6d5ef0069ed
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [36]:
# File location and type
file_location = "/content/adult.data"
file_type = "csv"# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)
display(df)

DataFrame[39: int,  State-gov: string,  77516: double,  Bachelors: string,  13: double,  Never-married: string,  Adm-clerical: string,  Not-in-family: string,  White: string,  Male: string,  2174: double,  0: double,  40: double,  United-States: string,  <=50K: string]

In [37]:
# Import pyspark functions
from pyspark.sql import functions as F# Create add new column to the dataset
df = df.withColumn('>50K', F.when(df.income == '<=50K', 0).otherwise(1))# Drop the Income label
df = df.drop('income')# Show dataset's columns
df.columns

AttributeError: ignored

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import (DecisionTreeClassifier, GBTClassifier, RandomForestClassifier, LogisticRegression)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [32]:
# Selecting categorical features
categorical_columns = [
 'workclass',
 'education_level',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'hours-per-week',
 'native-country',
 ]

In [33]:
# The index of string values multiple columns
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categorical_columns]# The encode of indexed values multiple columns
encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
    for indexer in indexers]

In [34]:
# Vectorizing encoded values
categorical_encoded = [encoder.getOutputCol() for encoder in encoders]
numerical_columns = ['age', 'education-num', 'capital-gain', 'capital-loss']
inputcols = categorical_encoded + numerical_columns
assembler = VectorAssembler(inputCols=inputcols, outputCol="features")

In [35]:
pipeline = Pipeline(stages=indexers + encoders+[assembler])
model = pipeline.fit(df)
# Transform data
transformed = model.transform(df)
display(transformed)

AnalysisException: ignored

In [17]:
# Transform data
final_data = transformed.select('features', '>50K')

NameError: ignored

In [None]:
# Initialize the classification models
dtc = DecisionTreeClassifier(labelCol='>50K', featuresCol='features')
rfc = RandomForestClassifier(numTrees=150, labelCol='>50K', featuresCol='features')
gbt = GBTClassifier(labelCol='>50K', featuresCol='features', maxIter=10)

In [None]:
train_data, test_data = final_data.randomSplit([0.8,0.2])

In [None]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [None]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [None]:
my_eval = BinaryClassificationEvaluator(labelCol='>50K')
# Display Decision Tree evaluation metric
print('DTC')
print(my_eval.evaluate(dtc_preds))

In [None]:
# Display Random Forest evaluation metric
print('RFC')
print(my_eval.evaluate(rfc_preds))

In [None]:
# Display Gradien Boosting Tree evaluation metric
print('GBT')
print(my_eval.evaluate(gbt_preds))

In [None]:
# Import libraries
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator# Set the Parameters grid
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 6])
             .addGrid(gbt.maxBins, [20, 60])
             .addGrid(gbt.maxIter, [10, 20])
             .build())# Iinitializing the cross validator class
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=my_eval, numFolds=5)# Run cross validations.  This can take about 6 minutes since it is training over 20 trees
cvModel = cv.fit(train_data)
gbt_predictions_2 = cvModel.transform(test_data)
my_eval.evaluate(gbt_predictions_2)