In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session used by every later cell: getOrCreate() reuses an
# already-running session when there is one and starts a new one otherwise.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load the name/gender table. The first line of the file supplies the column
# names and Spark infers each column's type from the data.
df = spark.read.csv(
    path='gender_name.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first 20 rows (the defaults of show(), spelled out explicitly).
df.show(n=20, truncate=True)

+-----------+------+-------+-----------+
|       Name|Gender|  Count|Probability|
+-----------+------+-------+-----------+
|      James|     M|5304407|0.014516787|
|       John|     M|5260831| 0.01439753|
|     Robert|     M|4970386|0.013602658|
|    Michael|     M|4579950|0.012534136|
|    William|     M|4226608| 0.01156713|
|       Mary|     F|4169663|0.011411287|
|      David|     M|3787547|0.010365534|
|     Joseph|     M|2695970|0.007378171|
|    Richard|     M|2638187|0.007220034|
|    Charles|     M|2433540|0.006659968|
|     Thomas|     M|2381034|0.006516273|
|Christopher|     M|2196198|0.006010425|
|     Daniel|     M|2039641|0.005581969|
|    Matthew|     M|1738699|0.004758368|
|  Elizabeth|     F|1704140|0.004663789|
|   Patricia|     F|1608260|0.004401391|
|   Jennifer|     F|1584426|0.004336163|
|    Anthony|     M|1506437|0.004122727|
|     George|     M|1495736|0.004093442|
|      Linda|     F|1480592|0.004051996|
+-----------+------+-------+-----------+
only showing top 20 rows

In [5]:
# Total number of rows in the loaded table.
df.count()

147269

In [6]:
# Number of columns in the loaded table.
len(df.columns)

4

In [7]:
# Print the column names, the types Spark inferred for them, and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- Probability: double (nullable = true)



In [8]:
# Summary statistics (count, mean, stddev, min, max) for every column.
(
    df.describe()
    .show()
)

+-------+------+------+------------------+--------------------+
|summary|  Name|Gender|             Count|         Probability|
+-------+------+------+------------------+--------------------+
|  count|147269|147269|            147269|              147269|
|   mean|   NaN|  null|2481.1613442068597|6.790295323639719E-6|
| stddev|   NaN|  null| 46454.71797453446|1.271345193803369...|
|    min|     A|     F|                 1|          2.73674E-9|
|    max| Zzyzx|     M|           5304407|         0.014516787|
+-------+------+------+------------------+--------------------+



In [9]:
# Return the first ten rows to the driver as a list of Row objects.
df.head(n=10)

[Row(Name='James', Gender='M', Count=5304407, Probability=0.014516787),
 Row(Name='John', Gender='M', Count=5260831, Probability=0.01439753),
 Row(Name='Robert', Gender='M', Count=4970386, Probability=0.013602658),
 Row(Name='Michael', Gender='M', Count=4579950, Probability=0.012534136),
 Row(Name='William', Gender='M', Count=4226608, Probability=0.01156713),
 Row(Name='Mary', Gender='F', Count=4169663, Probability=0.011411287),
 Row(Name='David', Gender='M', Count=3787547, Probability=0.010365534),
 Row(Name='Joseph', Gender='M', Count=2695970, Probability=0.007378171),
 Row(Name='Richard', Gender='M', Count=2638187, Probability=0.007220034),
 Row(Name='Charles', Gender='M', Count=2433540, Probability=0.006659968)]

In [26]:
# Count how many rows share each distinct Probability value.
(
    df.groupBy('Probability')
    .count()
    .show()
)

+-----------+-----+
|Probability|count|
+-----------+-----+
|0.002467508|    1|
| 9.43612E-4|    1|
| 5.93615E-4|    1|
| 4.60793E-4|    1|
| 3.00916E-4|    1|
| 2.81375E-4|    1|
| 2.23597E-4|    1|
| 1.94098E-4|    1|
| 1.34919E-4|    1|
|  1.3456E-4|    1|
| 1.09467E-4|    1|
| 8.09282E-5|    1|
| 4.92887E-5|    1|
| 4.81064E-5|    1|
|  4.0553E-5|    1|
| 3.05557E-5|    1|
| 2.88015E-5|    1|
| 2.71211E-5|    1|
| 2.39985E-5|    1|
| 2.30817E-5|    1|
+-----------+-----+
only showing top 20 rows



In [27]:
# Number of rows per gender label.
# The column is declared as `Gender` in the schema; using the exact casing
# avoids relying on Spark's case-insensitive column resolution, which stops
# working when spark.sql.caseSensitive is enabled.
df.groupby('Gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|     F|89749|
|     M|57520|
+------+-----+



In [28]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [29]:
# List the column names to choose the clustering inputs from.
df.columns

['Name', 'Gender', 'Count', 'Probability']

In [30]:
# Numeric columns that are packed into the feature vector for clustering.
input_cols = [
    'Count',
    'Probability',
]

In [31]:
# Assembler that combines the selected input columns into a single vector
# column named 'features'.
vec_assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol='features',
)

In [32]:
# Append the assembled 'features' column to the original table.
final_data = vec_assembler.transform(dataset=df)

In [33]:
# Preview the first 20 rows, now including the 'features' vector column.
final_data.show(n=20, truncate=True)

+-----------+------+-------+-----------+--------------------+
|       Name|Gender|  Count|Probability|            features|
+-----------+------+-------+-----------+--------------------+
|      James|     M|5304407|0.014516787|[5304407.0,0.0145...|
|       John|     M|5260831| 0.01439753|[5260831.0,0.0143...|
|     Robert|     M|4970386|0.013602658|[4970386.0,0.0136...|
|    Michael|     M|4579950|0.012534136|[4579950.0,0.0125...|
|    William|     M|4226608| 0.01156713|[4226608.0,0.0115...|
|       Mary|     F|4169663|0.011411287|[4169663.0,0.0114...|
|      David|     M|3787547|0.010365534|[3787547.0,0.0103...|
|     Joseph|     M|2695970|0.007378171|[2695970.0,0.0073...|
|    Richard|     M|2638187|0.007220034|[2638187.0,0.0072...|
|    Charles|     M|2433540|0.006659968|[2433540.0,0.0066...|
|     Thomas|     M|2381034|0.006516273|[2381034.0,0.0065...|
|Christopher|     M|2196198|0.006010425|[2196198.0,0.0060...|
|     Daniel|     M|2039641|0.005581969|[2039641.0,0.0055...|
|    Mat

In [34]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [35]:
# Configure a 3-cluster K-Means estimator over the assembled 'features' column.
# An explicit seed is set so that repeated runs start from the same random
# initialisation; without it the cluster ids shown in later cells can differ
# from one kernel session to the next.
# NOTE(review): the features are not scaled. Per the describe() output, Count
# reaches ~5.3M while Probability stays below ~0.015, so distances are
# presumably dominated by Count — consider scaling if that is unintended.
kmeans = KMeans(featuresCol='features', k=3, seed=42)

In [36]:
# Fit the K-Means estimator on the assembled feature vectors.
model = kmeans.fit(dataset=final_data)

In [37]:
# Assign clusters, then count rows per distinct Probability value.
# NOTE(review): grouping by Probability ignores the new 'prediction' column,
# so this shows the same counts as grouping the raw table; if cluster sizes
# were intended, group by 'prediction' instead — confirm.
(
    model.transform(final_data)
    .groupBy('Probability')
    .count()
    .show()
)

+-----------+-----+
|Probability|count|
+-----------+-----+
|0.002467508|    1|
| 9.43612E-4|    1|
| 5.93615E-4|    1|
| 4.60793E-4|    1|
| 3.00916E-4|    1|
| 2.81375E-4|    1|
| 2.23597E-4|    1|
| 1.94098E-4|    1|
| 1.34919E-4|    1|
|  1.3456E-4|    1|
| 1.09467E-4|    1|
| 8.09282E-5|    1|
| 4.92887E-5|    1|
| 4.81064E-5|    1|
|  4.0553E-5|    1|
| 3.05557E-5|    1|
| 2.88015E-5|    1|
| 2.71211E-5|    1|
| 2.39985E-5|    1|
| 2.30817E-5|    1|
+-----------+-----+
only showing top 20 rows



In [38]:
# Keep the clustered table (input columns plus 'prediction') for later cells.
predictions = model.transform(dataset=final_data)

In [39]:
# Preview the first 20 rows together with their assigned cluster id.
predictions.show(n=20, truncate=True)

+-----------+------+-------+-----------+--------------------+----------+
|       Name|Gender|  Count|Probability|            features|prediction|
+-----------+------+-------+-----------+--------------------+----------+
|      James|     M|5304407|0.014516787|[5304407.0,0.0145...|         1|
|       John|     M|5260831| 0.01439753|[5260831.0,0.0143...|         1|
|     Robert|     M|4970386|0.013602658|[4970386.0,0.0136...|         1|
|    Michael|     M|4579950|0.012534136|[4579950.0,0.0125...|         1|
|    William|     M|4226608| 0.01156713|[4226608.0,0.0115...|         1|
|       Mary|     F|4169663|0.011411287|[4169663.0,0.0114...|         1|
|      David|     M|3787547|0.010365534|[3787547.0,0.0103...|         1|
|     Joseph|     M|2695970|0.007378171|[2695970.0,0.0073...|         2|
|    Richard|     M|2638187|0.007220034|[2638187.0,0.0072...|         2|
|    Charles|     M|2433540|0.006659968|[2433540.0,0.0066...|         2|
|     Thomas|     M|2381034|0.006516273|[2381034.0,

In [41]:
# Count rows for each (Probability, cluster id) combination.
(
    predictions
    .groupBy('Probability', 'prediction')
    .count()
    .show()
)

+-----------+----------+-----+
|Probability|prediction|count|
+-----------+----------+-----+
| 7.66692E-4|         0|    1|
|  3.5325E-4|         0|    1|
| 3.35426E-4|         0|    1|
| 3.28119E-4|         0|    1|
| 3.27621E-4|         0|    1|
| 1.87122E-4|         0|    1|
| 1.54426E-4|         0|    1|
| 1.11062E-4|         0|    1|
| 8.64454E-5|         0|    1|
| 8.54219E-5|         0|    1|
| 5.60457E-5|         0|    1|
| 5.39083E-5|         0|    1|
| 3.50139E-5|         0|    1|
| 3.09991E-5|         0|    2|
| 3.05694E-5|         0|    1|
| 2.14998E-5|         0|    1|
| 2.07938E-5|         0|    1|
| 1.69897E-5|         0|    2|
| 1.42666E-5|         0|    1|
| 9.71817E-6|         0|    1|
+-----------+----------+-----+
only showing top 20 rows



In [42]:
# Rows per distinct Probability value on the raw table (same query as the
# earlier In [26] cell).
(
    df.groupBy('Probability')
    .count()
    .show()
)

+-----------+-----+
|Probability|count|
+-----------+-----+
|0.002467508|    1|
| 9.43612E-4|    1|
| 5.93615E-4|    1|
| 4.60793E-4|    1|
| 3.00916E-4|    1|
| 2.81375E-4|    1|
| 2.23597E-4|    1|
| 1.94098E-4|    1|
| 1.34919E-4|    1|
|  1.3456E-4|    1|
| 1.09467E-4|    1|
| 8.09282E-5|    1|
| 4.92887E-5|    1|
| 4.81064E-5|    1|
|  4.0553E-5|    1|
| 3.05557E-5|    1|
| 2.88015E-5|    1|
| 2.71211E-5|    1|
| 2.39985E-5|    1|
| 2.30817E-5|    1|
+-----------+-----+
only showing top 20 rows

