In [97]:
from pyspark.sql import SparkSession

In [98]:
spark = SparkSession.builder.appName('Basics').getOrCreate()

In [99]:
df = spark.read.csv('master.csv',header = True)

In [100]:
df.show()

+-------+----+------+-----------+-----------+----------+-----------------+
|country|year|   sex|        age|suicides_no|population|suicides/100k pop|
+-------+----+------+-----------+-----------+----------+-----------------+
|Albania|1987|  male|15-24 years|         21|    312900|             6.71|
|Albania|1987|  male|35-54 years|         16|    308000|             5.19|
|Albania|1987|female|15-24 years|         14|    289700|             4.83|
|Albania|1987|  male|  75+ years|          1|     21800|             4.59|
|Albania|1987|  male|25-34 years|          9|    274300|             3.28|
|Albania|1987|female|  75+ years|          1|     35600|             2.81|
|Albania|1987|female|35-54 years|          6|    278800|             2.15|
|Albania|1987|female|25-34 years|          4|    257200|             1.56|
|Albania|1987|  male|55-74 years|          1|    137500|             0.73|
|Albania|1987|female| 5-14 years|          0|    311000|                0|
|Albania|1987|female|55-7

In [101]:
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- year: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicides_no: string (nullable = true)
 |-- population: string (nullable = true)
 |-- suicides/100k pop: string (nullable = true)



In [102]:
df.columns

['country',
 'year',
 'sex',
 'age',
 'suicides_no',
 'population',
 'suicides/100k pop']

In [103]:
df.describe().show()

+-------+----------+------------------+------+-----------+------------------+------------------+------------------+
|summary|   country|              year|   sex|        age|       suicides_no|        population| suicides/100k pop|
+-------+----------+------------------+------+-----------+------------------+------------------+------------------+
|  count|     27820|             27820| 27820|      27820|             27820|             27820|             27820|
|   mean|      null|2001.2583752695903|  null|       null|242.57440690150972|1844793.6173975556|12.816097411933894|
| stddev|      null| 8.469055024441408|  null|       null| 902.0479168336403|3911779.4417563565| 18.96151101450326|
|    min|   Albania|              1985|female|15-24 years|                 0|              1000|                 0|
|    max|Uzbekistan|              2016|  male|  75+ years|               999|            999700|             99.99|
+-------+----------+------------------+------+-----------+--------------

In [104]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType

In [105]:
data_schema = [StructField('country', StringType(), True),
                  StructField('year', StringType(),True),
              StructField('sex', StringType(),True), StructField('age',StringType(),True),
              StructField('suicides_no', IntegerType(),True), StructField('population', IntegerType(),True),
              StructField('suicides/100k pop', FloatType(), True)]

In [106]:
final_struc = StructType(fields = data_schema)

In [107]:
df = spark.read.csv('master.csv', schema = final_struc)

In [108]:
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- year: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicides_no: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- suicides/100k pop: float (nullable = true)



In [109]:
df.show(5)

+-------+----+------+-----------+-----------+----------+-----------------+
|country|year|   sex|        age|suicides_no|population|suicides/100k pop|
+-------+----+------+-----------+-----------+----------+-----------------+
|   null|null|  null|       null|       null|      null|             null|
|Albania|1987|  male|15-24 years|         21|    312900|             6.71|
|Albania|1987|  male|35-54 years|         16|    308000|             5.19|
|Albania|1987|female|15-24 years|         14|    289700|             4.83|
|Albania|1987|  male|  75+ years|          1|     21800|             4.59|
+-------+----+------+-----------+-----------+----------+-----------------+
only showing top 5 rows



In [110]:
df.columns

['country',
 'year',
 'sex',
 'age',
 'suicides_no',
 'population',
 'suicides/100k pop']

In [111]:
df.select('age').show(5)

+-----------+
|        age|
+-----------+
|        age|
|15-24 years|
|35-54 years|
|15-24 years|
|  75+ years|
+-----------+
only showing top 5 rows



In [112]:
df.head(2)[0]

Row(country=None, year=None, sex=None, age=None, suicides_no=None, population=None, suicides/100k pop=None)

In [113]:
df.select(['age','population']).show(5)

+-----------+----------+
|        age|population|
+-----------+----------+
|       null|      null|
|15-24 years|    312900|
|35-54 years|    308000|
|15-24 years|    289700|
|  75+ years|     21800|
+-----------+----------+
only showing top 5 rows



In [114]:
df.withColumn('double_suicides',df['suicides_no']*2).show(5)

+-------+----+------+-----------+-----------+----------+-----------------+---------------+
|country|year|   sex|        age|suicides_no|population|suicides/100k pop|double_suicides|
+-------+----+------+-----------+-----------+----------+-----------------+---------------+
|   null|null|  null|       null|       null|      null|             null|           null|
|Albania|1987|  male|15-24 years|         21|    312900|             6.71|             42|
|Albania|1987|  male|35-54 years|         16|    308000|             5.19|             32|
|Albania|1987|female|15-24 years|         14|    289700|             4.83|             28|
|Albania|1987|  male|  75+ years|          1|     21800|             4.59|              2|
+-------+----+------+-----------+-----------+----------+-----------------+---------------+
only showing top 5 rows



In [115]:
df.show()

+-------+----+------+-----------+-----------+----------+-----------------+
|country|year|   sex|        age|suicides_no|population|suicides/100k pop|
+-------+----+------+-----------+-----------+----------+-----------------+
|   null|null|  null|       null|       null|      null|             null|
|Albania|1987|  male|15-24 years|         21|    312900|             6.71|
|Albania|1987|  male|35-54 years|         16|    308000|             5.19|
|Albania|1987|female|15-24 years|         14|    289700|             4.83|
|Albania|1987|  male|  75+ years|          1|     21800|             4.59|
|Albania|1987|  male|25-34 years|          9|    274300|             3.28|
|Albania|1987|female|  75+ years|          1|     35600|             2.81|
|Albania|1987|female|35-54 years|          6|    278800|             2.15|
|Albania|1987|female|25-34 years|          4|    257200|             1.56|
|Albania|1987|  male|55-74 years|          1|    137500|             0.73|
|Albania|1987|female| 5-1

In [116]:
df.createOrReplaceTempView('people')

In [131]:
results = spark.sql('select * from people where year=1987 and sex = "male" and suicides_no=21')

In [132]:
results.show()

+---------+----+----+-----------+-----------+----------+-----------------+
|  country|year| sex|        age|suicides_no|population|suicides/100k pop|
+---------+----+----+-----------+-----------+----------+-----------------+
|  Albania|1987|male|15-24 years|         21|    312900|             6.71|
|Mauritius|1987|male|25-34 years|         21|     91300|             23.0|
|Singapore|1987|male|15-24 years|         21|    253600|             8.28|
+---------+----+----+-----------+-----------+----------+-----------------+

