In [218]:
from hdfs import InsecureClient
from pyspark.sql import SparkSession
import json
import pandas as pd
import pyarrow as pa
import numpy as np
from elasticsearch import Elasticsearch,helpers

In [241]:
#Setting up Elasticsearch Client
es = Elasticsearch(hosts="elasticsearch:9200",http_auth=("elastic","elastic@1234"))

# Listing all files stored at the /dfs-data location

In [74]:
client = InsecureClient("http://hadoop-filebeat:50070",user="nifi")
client.list('/dfs-data')

['108debbf-e7bd-449a-a3f8-109694a7ddff',
 '1f258e72-726a-46ed-81fd-48572bcb20fc',
 '2dce5275-b1b7-4913-b35c-831a14fbdc8f',
 '5665cfd6-acb1-4c4f-9dac-d8a459b9ef63',
 '5df33cb9-bc65-47e4-b43e-b4203392bf6b',
 '60df1b71-987d-4efc-9f6b-8d4a86a61783',
 '8ff72c0e-e243-48c4-b72b-8ae52b264b04',
 '9fbf761d-aab9-46b7-9881-9266de7b7f11',
 'a4374119-7d96-4213-8036-601070c76c55',
 'a94fd971-7ffb-4c6f-beec-546eaf8c355a',
 'a986266b-b498-48dc-ae0a-167a70ab4d3a',
 'ae52e9c8-ce35-462a-ac48-5a0f71a94480',
 'ba9f64d0-09c0-4f7f-85c5-d34a68103a9e',
 'caadfc9a-f5f0-4553-b45a-4c1fdcd3f219',
 'df198eed-12cd-4c93-832f-d06a40a1e273',
 'e35859d5-5795-4482-ab09-25323a70e8f3',
 'e87bf0ac-6753-4c71-a906-68f5cc979173']

# Initializing Spark Session object

In [75]:
spark = (SparkSession
        .builder
        .appName("hdfsprocessor")
        .config("spark.jars","/home/jovyan/spark_conf/elasticsearch-hadoop-7.11.1/dist/elasticsearch-hadoop-7.11.1.jar")
        .config("spark.jars","/home/jovyan/spark_conf/elasticsearch-hadoop-7.11.1/dist/elasticsearch-spark-20_2.11-7.11.1.jar") 
        .config("spark.es.nodes","elasticsearch")
        .config("spark.es.port","9200")
        .getOrCreate())

# Merging all files in a single dataframe

In [94]:
final_df = pd.DataFrame()
for file in client.list("/dfs-data"):
    print(f"Processing file: {file}")
    df_file = spark.read.format("csv").option("delimiter",",").option("header","true").load(f"hdfs://hadoop-filebeat:8020/dfs-data/{file}")
    final_df = pd.concat([final_df,df_file.toPandas()]).reset_index(drop=True)

Processing file: 108debbf-e7bd-449a-a3f8-109694a7ddff
Processing file: 1f258e72-726a-46ed-81fd-48572bcb20fc
Processing file: 2dce5275-b1b7-4913-b35c-831a14fbdc8f
Processing file: 5665cfd6-acb1-4c4f-9dac-d8a459b9ef63
Processing file: 5df33cb9-bc65-47e4-b43e-b4203392bf6b
Processing file: 60df1b71-987d-4efc-9f6b-8d4a86a61783
Processing file: 8ff72c0e-e243-48c4-b72b-8ae52b264b04
Processing file: 9fbf761d-aab9-46b7-9881-9266de7b7f11
Processing file: a4374119-7d96-4213-8036-601070c76c55
Processing file: a94fd971-7ffb-4c6f-beec-546eaf8c355a
Processing file: a986266b-b498-48dc-ae0a-167a70ab4d3a
Processing file: ae52e9c8-ce35-462a-ac48-5a0f71a94480
Processing file: ba9f64d0-09c0-4f7f-85c5-d34a68103a9e
Processing file: caadfc9a-f5f0-4553-b45a-4c1fdcd3f219
Processing file: df198eed-12cd-4c93-832f-d06a40a1e273
Processing file: e35859d5-5795-4482-ab09-25323a70e8f3
Processing file: e87bf0ac-6753-4c71-a906-68f5cc979173


In [95]:
#removing all the duplicate rows
final_df = final_df.drop_duplicates()

# Performing Analysis

In [158]:
#replacing values 
final_df.replace([".","M","F","Accidents Except Drug Posioning (V01-X39, X43, X45-X59, Y85-Y86)","Black Non-Hispanic","White Non-Hispanic"],
                 [np.nan,"Male","Female","Accidents Except Drug Poisoning (V01-X39, X43, X45-X59, Y85-Y86)","Non-Hispanic Black","Non-Hispanic White"],
                 inplace=True)

In [159]:
def convert_columns(data_df,columns,fill_anchor_column):
    """
        converting death rates and age adjusted death rates to float types 
        and replacing the nan values with the mean of the group for that year
        
        Input
        
        data_df(pandas.DataFrame): dataframe to convert columns of
        columns(dict): dict containing column names and the type the column to be converted to
        fill_anchor_column(str): name of the anchor colume, by which the dataframe would be grouped 
                            and the filling value would be calculated
        
        Output
        
        copied_df(pandas.DataFrame): processed dataframe
    """
    copied_df = data_df.copy(deep=True)
    for column in columns:
        copied_df[column] = copied_df[column].astype(columns[column])
        copied_df[column] = copied_df.groupby(fill_anchor_column)[column].transform(lambda x:x.fillna(x.mean()))
    return copied_df


def calc_percentages(data_df,target_column):
    """
        Function to calculate percentages of each column value and output pd.Series
        
        Input:
        
        data_df(pd.DataFrame): dataframe to calculate percentages from
        target_column(str): targt column to calculate percentages of
        
        Output:
        pandas.Series: series containing the calculated percentages for target_column of data_df
    """
    return data_df[target_column].apply(lambda x:int(x/data_df[target_column].values[0]*100))

### Some topics for analysis

#### - Analysis across genders
    - What are the leading cause of deaths per year, with their average death count, across genders
    - What are the average deaths per year, across genders
#### - Analysis for race ethnicites
    - What are the leading cause of deaths per year, across race ethnicites

Some notes:
- Death rates and age adjusted death rates are not available for minor and unknown ethnicites


In [160]:
segregated_df = final_df[~final_df['race_ethnicity'].isin(['Not Stated/Unknown','Other Race/Ethnicity'])]
segregated_df = convert_columns(segregated_df,{"deaths":float,"death_rate":float,"age_adjusted_death_rate":float},"year")

In [161]:
sex_analysis_pivot = pd.pivot_table(data=segregated_df,
                       index='year',
                       columns='sex',
                       values='deaths',
                       aggfunc=np.sum,
                       fill_value=0).astype(int)

In [162]:
sex_analysis_pivot['Female_pct_change'] = calc_percentages(sex_analysis_pivot,"Female")
sex_analysis_pivot['Male_pct_change'] = calc_percentages(sex_analysis_pivot,"Male")
sex_analysis_pivot

sex,Female,Male,Female_pct_change,Male_pct_change
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,17530,7865,100,100
2011,29545,27026,168,343
2012,30134,27394,171,348
2013,28267,27212,161,345
2014,27627,25710,157,326
2015,27076,26067,154,331
2016,26769,26468,152,336
2017,26765,26553,152,337


- The above table suggests that for females, the total death counts shows a incline of 52% from 17530 to 26765, while for males the average death counts have more than tripled (surge of 237%), from 7865 to 26553. 
- This shows that the matter of concern is much higher in males as compared to females

### Lets dive deep to understand the reason for deaths amongst the genders

In [163]:
leading_cause_df = segregated_df.groupby(by=["year","sex"])[['leading_cause','deaths']].max().reset_index()
leading_cause_df[leading_cause_df['sex']=='Female']

Unnamed: 0,year,sex,leading_cause,deaths
0,2010,Female,Mental and Behavioral Disorders due to Acciden...,5351.0
2,2011,Female,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",5016.0
4,2012,Female,Septicemia (A40-A41),4719.0
6,2013,Female,Viral Hepatitis (B15-B19),4535.0
8,2014,Female,Septicemia (A40-A41),4507.0
10,2015,Female,Septicemia (A40-A41),4494.0
12,2016,Female,Septicemia (A40-A41),4306.0
14,2017,Female,Septicemia (A40-A41),4279.0


In [164]:
leading_cause_df[leading_cause_df['sex']=='Male']

Unnamed: 0,year,sex,leading_cause,deaths
1,2010,Male,Mental and Behavioral Disorders due to Acciden...,1354.0
3,2011,Male,Mental and Behavioral Disorders due to Acciden...,4220.0
5,2012,Male,Septicemia (A40-A41),4156.0
7,2013,Male,Mental and Behavioral Disorders due to Acciden...,4085.0
9,2014,Male,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",3990.0
11,2015,Male,Viral Hepatitis (B15-B19),4015.0
13,2016,Male,Peptic Ulcer (K25-K28),4054.0
15,2017,Male,Septicemia (A40-A41),4082.0


- The above analysis shows that for females, the leading cause of death in recent years has been Septicemia, a disease which can occur through urinary tract infections, pneumonia, kidney infections etc.
- As compared, the causes are quite different for men, between Mental and Behavioral disorders due to poisoning and other psychoactive substance use, viral Hepatitis, Peptic Ulcer and Septicemia

### Analysis of the reasons leading to the disease
- A quick look on the internet and we find out that Septicemia is caused by lung and kidney infection, body organs affected by smoking and alcohol consumption. Furthermore, psychoactive substances do include alcohol and nicotine(present in cigarettes).
- In addition risk factors of peptic ulcers include smoking and alcohol consumption. All these suggest that <b>alcohol consumption and smoking have been the major reasons causing deaths in the last 7 years</b>
- Also, looking at the surge of death counts, while the leading cause being the same, it is quite possible that alcohol consumption and smoking have increased in NYC in the span of 7 years

### Let's analyze the death rates in context of race ethnicites

In [167]:
race_ethnicity_pivot = pd.pivot_table(data=segregated_df,
                                   index='year',
                                   columns='race_ethnicity',
                                   values='deaths',
                                   aggfunc=np.sum,
                                   fill_value=0).astype(int)

In [172]:
race_ethnicity_pivot['Asian_Pacific_pct_change'] = calc_percentages(race_ethnicity_pivot,"Asian and Pacific Islander")
race_ethnicity_pivot['Hispanic_pct_change'] = calc_percentages(race_ethnicity_pivot,"Hispanic")
race_ethnicity_pivot['Non-Hispanic_Black_pct_change'] = calc_percentages(race_ethnicity_pivot,"Non-Hispanic Black")
race_ethnicity_pivot['Non-Hispanic_White_pct_change'] = calc_percentages(race_ethnicity_pivot,"Non-Hispanic White")
race_ethnicity_pivot['Other_Race_pct_change'] = calc_percentages(race_ethnicity_pivot,"Other Race/ Ethnicity")
race_ethnicity_pivot

race_ethnicity,Asian and Pacific Islander,Hispanic,Non-Hispanic Black,Non-Hispanic White,Other Race/ Ethnicity,Asian_Pacific_pct_change,Hispanic_pct_change,Non-Hispanic_Black_pct_change,Non-Hispanic_White_pct_change,Other_Race_pct_change
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010,1125,6387,4720,10347,2816,100,100,100,100,100
2011,3341,9391,13910,25549,4381,296,147,294,246,155
2012,3446,9420,13864,24904,5895,306,147,293,240,209
2013,3651,9672,13911,24891,3354,324,151,294,240,119
2014,3880,9687,13755,24533,1482,344,151,291,237,52
2015,4078,10182,14178,24568,137,362,159,300,237,4
2016,4252,10465,14239,24146,135,377,163,301,233,4
2017,4524,10637,14347,23679,131,402,166,303,228,4


- The above analysis shows that Asians and non-Hispanic black people are the two ethnicities where the surge in death counts is highest. The death counts in asians have more than quadrupled, from 1125 to 4524. Whereas, for non-hispanic black people, the death counts have more than tripled.

- Non-hispanic white people have accounted for the most deaths amongst all race ethnicities.

- The data for other races is inconsistent and any insights generated based on that data would be inconsistent.

### Let's do analysis based on the death rate amongst ethnicities

In [208]:
ethnicity_death_rate_pivot = segregated_df.groupby(by=["race_ethnicity","year","leading_cause"])[['death_rate']].mean().reset_index()
ethnicity_death_rate_pivot[ethnicity_death_rate_pivot['race_ethnicity']=="Asian and Pacific Islander"].groupby("year").max()

Unnamed: 0_level_0,race_ethnicity,leading_cause,death_rate
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,Asian and Pacific Islander,Malignant Neoplasms (Cancer: C00-C97),73.6
2011,Asian and Pacific Islander,Malignant Neoplasms (Cancer: C00-C97),91.6
2012,Asian and Pacific Islander,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",97.0
2013,Asian and Pacific Islander,Viral Hepatitis (B15-B19),96.0
2014,Asian and Pacific Islander,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",97.35
2015,Asian and Pacific Islander,Parkinson's Disease (G20),96.383551
2016,Asian and Pacific Islander,Malignant Neoplasms (Cancer: C00-C97),100.725757
2017,Asian and Pacific Islander,Malignant Neoplasms (Cancer: C00-C97),108.319698


In [210]:
ethnicity_death_rate_pivot[ethnicity_death_rate_pivot['race_ethnicity']=="Non-Hispanic Black"].groupby("year").max()

Unnamed: 0_level_0,race_ethnicity,leading_cause,death_rate
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,Non-Hispanic Black,Human Immunodeficiency Virus Disease (HIV: B20...,217.7
2011,Non-Hispanic Black,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",215.0
2012,Non-Hispanic Black,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",221.95
2013,Non-Hispanic Black,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",222.05
2014,Non-Hispanic Black,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",217.95
2015,Non-Hispanic Black,"Nephritis, Nephrotic Syndrome and Nephrisis (N...",229.962208
2016,Non-Hispanic Black,Mental and Behavioral Disorders due to Acciden...,233.237448
2017,Non-Hispanic Black,Mental and Behavioral Disorders due to Acciden...,243.860108


In [211]:
ethnicity_death_rate_pivot[ethnicity_death_rate_pivot['race_ethnicity']=="Non-Hispanic White"].groupby("year").max()

Unnamed: 0_level_0,race_ethnicity,leading_cause,death_rate
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,Non-Hispanic White,Malignant Neoplasms (Cancer: C00-C97),374.2
2011,Non-Hispanic White,Mental and Behavioral Disorders due to Acciden...,335.25
2012,Non-Hispanic White,Septicemia (A40-A41),321.5
2013,Non-Hispanic White,Septicemia (A40-A41),311.25
2014,Non-Hispanic White,Septicemia (A40-A41),307.55
2015,Non-Hispanic White,Septicemia (A40-A41),308.206813
2016,Non-Hispanic White,Mental and Behavioral Disorders due to Acciden...,305.379117
2017,Non-Hispanic White,Mental and Behavioral Disorders due to Acciden...,303.908625


### Analysis of the death rates amongst ethnicities
- The above pivots show that the major reasons for deaths amongst Asians is Cancer. However, the death rate has increased by 47% from 73.6 in 2010 to 108.31 in 2017.
- For non-Hispanic Black people, Nephritis, Nephrotic Syndrome and Nephrisis have been the major reason for death rates for several years.
- <b>The death rate data for non-Hispanic White people is very representative of the overall population amongst the genders</b>, where Septicemia and mental and behavioral disorders due to overdose are major reasons for deaths
- The <b>death rate</b> for the most prevelant reason amongst <b>non-Hispanic White people</b> has actually <b>decreased</b> along the years. But the <b>actual death count has increased by 128%</b>. This suggests that there is not a single major reason driving the death counts amongst white people, but several causes that are critical

# Saving csv files to /csv-data in hdfs

listing files to save
- segregated_df
- sex_analysis_pivot
- leading_cause_df
- race_ethnicity_pivot
- ethnicity_death_rate_pivot

In [225]:
def save_dataframe_to_hdfs(data_df,target_file_name,host,port,path):
    """
        Function to store dataframe in hdfs
        
        Input:
        
        data_df(pandas.DataFrame): dataframe to write to hdfs
        host: url where hdfs is hosted
        port: port on which hdfs service is running
        path: the directory path to store dataframe to
    """
    sdf = spark.createDataFrame(data_df)
    sdf.write.format("csv").save(f"{host}:{port}/{path}/{target_file_name}")

In [226]:
save_dataframe_to_hdfs(segregated_df,'segregated_df.csv','hdfs://hadoop-filebeat','8020','csv-data')
save_dataframe_to_hdfs(sex_analysis_pivot,'sex_analysis_pivot.csv','hdfs://hadoop-filebeat','8020','csv-data')
save_dataframe_to_hdfs(leading_cause_df,'leading_cause_df.csv','hdfs://hadoop-filebeat','8020','csv-data')
save_dataframe_to_hdfs(race_ethnicity_pivot,'race_ethnicity_pivot.csv','hdfs://hadoop-filebeat','8020','csv-data')
save_dataframe_to_hdfs(ethnicity_death_rate_pivot,'ethnicity_death_rate_pivot.csv','hdfs://hadoop-filebeat','8020','csv-data')

AnalysisException: path hdfs://hadoop-filebeat:8020/csv-data/segregated_df.csv already exists.

## Inserting documents into Elasticsearch cluster

In [294]:
def save_dataframe_to_elasticsearch(data_df,es_client,target_index_name):
    """
        Function to save pandas dataframe to elasticsearch
        
        Input:
        data_df(pandas.DataFrame): pandas DataFrame to store in elasticsearch
        es_client: elasticsearch client to store data
        target_index_name: name of index in elasticsearch cluster to put the data onto
        keys_to_use: columns name in dataframe to fetch
    """

    def doc_generator(data_df):
        data_df_iter = data_df.iterrows()
        for index, document in data_df_iter:
            yield{
                "_index":target_index_name,
                "_id": str(index),
                "_source": {key: document[key] for key in document.keys()}
            }

    helpers.bulk(es_client,doc_generator(data_df))

In [299]:
save_dataframe_to_elasticsearch(segregated_df,es,"segregated_data")
save_dataframe_to_elasticsearch(sex_analysis_pivot.reset_index(),es,"gender_based_analysis_data")
save_dataframe_to_elasticsearch(leading_cause_df,es,"leading_cause_data")
save_dataframe_to_elasticsearch(race_ethnicity_pivot.reset_index(),es,"race_ethnicity_data")
save_dataframe_to_elasticsearch(ethnicity_death_rate_pivot,es,"ethnicity_death_rate_data")