In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if one already exists; otherwise start one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [17]:
# Load the name/gender frequency table. The header row supplies the column
# names and inferSchema lets Spark detect the column types.
# The exploration cells that follow (take/printSchema/head/describe) read
# this frame through the name `df`, so it has to be bound here; otherwise
# they raise NameError on a fresh kernel (Restart & Run All).
df = spark.read.csv('gender_name.csv', inferSchema=True, header=True)

# Keep the original `df_food` name as an alias: the VectorAssembler cell
# still calls `assembler_data.transform(df_food)`.
df_food = df

In [18]:
import pandas as pd

# Pull the first five rows to the driver and show them transposed, so each
# column of the Spark frame becomes one row of the rendered pandas table.
pd.DataFrame(data=df.take(5), columns=df.columns).T

Unnamed: 0,0,1,2,3,4
Name,James,John,Robert,Michael,William
Gender,M,M,M,M,M
Count,5304407,5260831,4970386,4579950,4226608
Probability,0.0145168,0.0143975,0.0136027,0.0125341,0.0115671


In [19]:
# Show the column names and the types Spark inferred when reading the CSV.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- Probability: double (nullable = true)



In [20]:
# Return the first ten rows as a list of Row objects for a quick look.
df.head(n=10)

[Row(Name='James', Gender='M', Count=5304407, Probability=0.014516787),
 Row(Name='John', Gender='M', Count=5260831, Probability=0.01439753),
 Row(Name='Robert', Gender='M', Count=4970386, Probability=0.013602658),
 Row(Name='Michael', Gender='M', Count=4579950, Probability=0.012534136),
 Row(Name='William', Gender='M', Count=4226608, Probability=0.01156713),
 Row(Name='Mary', Gender='F', Count=4169663, Probability=0.011411287),
 Row(Name='David', Gender='M', Count=3787547, Probability=0.010365534),
 Row(Name='Joseph', Gender='M', Count=2695970, Probability=0.007378171),
 Row(Name='Richard', Gender='M', Count=2638187, Probability=0.007220034),
 Row(Name='Charles', Gender='M', Count=2433540, Probability=0.006659968)]

In [21]:
# Print summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+------+------+------------------+--------------------+
|summary|  Name|Gender|             Count|         Probability|
+-------+------+------+------------------+--------------------+
|  count|147269|147269|            147269|              147269|
|   mean|   NaN|  null|2481.1613442068597|6.790295323639719E-6|
| stddev|   NaN|  null| 46454.71797453446|1.271345193803369...|
|    min|     A|     F|                 1|          2.73674E-9|
|    max| Zzyzx|     M|           5304407|         0.014516787|
+-------+------+------+------------------+--------------------+



In [22]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [26]:
# Pack the two numeric columns into a single vector column named "features",
# which is the input format Spark ML estimators consume.
assembler_data = VectorAssembler(
    inputCols=['Count', 'Probability'],
    outputCol="features",
)

# Append the assembled vector column to the loaded frame and preview it.
output = assembler_data.transform(df_food)
output.show()

+-----------+------+-------+-----------+--------------------+
|       Name|Gender|  Count|Probability|            features|
+-----------+------+-------+-----------+--------------------+
|      James|     M|5304407|0.014516787|[5304407.0,0.0145...|
|       John|     M|5260831| 0.01439753|[5260831.0,0.0143...|
|     Robert|     M|4970386|0.013602658|[4970386.0,0.0136...|
|    Michael|     M|4579950|0.012534136|[4579950.0,0.0125...|
|    William|     M|4226608| 0.01156713|[4226608.0,0.0115...|
|       Mary|     F|4169663|0.011411287|[4169663.0,0.0114...|
|      David|     M|3787547|0.010365534|[3787547.0,0.0103...|
|     Joseph|     M|2695970|0.007378171|[2695970.0,0.0073...|
|    Richard|     M|2638187|0.007220034|[2638187.0,0.0072...|
|    Charles|     M|2433540|0.006659968|[2433540.0,0.0066...|
|     Thomas|     M|2381034|0.006516273|[2381034.0,0.0065...|
|Christopher|     M|2196198|0.006010425|[2196198.0,0.0060...|
|     Daniel|     M|2039641|0.005581969|[2039641.0,0.0055...|
|    Mat

In [27]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer

# The dataset has no 'Spoiled' column (its columns are Name, Gender, Count,
# Probability), so a classifier configured with labelCol='Spoiled' fails as
# soon as it is fitted. The categorical column available as a target is
# 'Gender', but it is a string and Spark ML classifiers need a numeric label,
# so it is indexed into a numeric 'label' column first.
gender_indexer = StringIndexer(inputCol='Gender', outputCol='label')

# NOTE: despite the historical name `rfc`, this is a single decision tree,
# not a random forest. The name is kept so later cells that reference it
# continue to resolve.
rfc = DecisionTreeClassifier(labelCol='label', featuresCol='features')

# Fit this pipeline (rather than `rfc` on its own) on frames that contain the
# raw 'Gender' column and the assembled 'features' column: the 'label' column
# only exists after the indexer stage has run.
rfc_pipeline = Pipeline(stages=[gender_indexer, rfc])

In [40]:
# Randomly split the assembled frame roughly 80/20 into training and test
# sets; the fixed seed keeps the split repeatable across runs.
train, test = output.randomSplit(weights=[0.8, 0.2], seed=17)

In [43]:
# Report how many rows landed in each side of the split.
train_rows = train.count()
print('Size of training data :', train_rows)
test_rows = test.count()
print('Size of testing data :', test_rows)

Size of training data : 117662
Size of testing data : 29607
