In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from io import StringIO

warnings.filterwarnings('ignore')

In [None]:
def load_terrorism_file(filename: str) -> pd.DataFrame: #has country,target type
    """
        Load the terrorism CSV, retaining only the columns used by the analysis.
        Uses Pandas read_csv() with its compression='infer' option, so a compressed
        file is handled based on its extension.

        The column names are supplied explicitly through names= and no header row is
        declared, so a header line in the file is read as an ordinary data row
        (its 'iyear' value is coerced to NaN below).

        param filename : path of the terrorism file to read
        return : dataframe with the columns iyear, country_txt, attacktype1_txt and targtype1_txt
    """

    csv_columns = ['eventid','iyear','imonth','iday','approxdate','extended','resolution','country','country_txt','region','region_txt','provstate','city','latitude','longitude','specificity','vicinity','location','summary','crit1','crit2','crit3','doubtterr','alternative','alternative_txt','multiple','success','suicide','attacktype1','attacktype1_txt','attacktype2','attacktype2_txt','attacktype3','attacktype3_txt','targtype1','targtype1_txt','targsubtype1','targsubtype1_txt','corp1','target1','natlty1','natlty1_txt','targtype2','targtype2_txt','targsubtype2','targsubtype2_txt','corp2','target2','natlty2','natlty2_txt','targtype3','targtype3_txt','targsubtype3','targsubtype3_txt','corp3','target3','natlty3','natlty3_txt','gname','gsubname','gname2','gsubname2','gname3','gsubname3','motive','guncertain1','guncertain2','guncertain3','individual','nperps','nperpcap','claimed','claimmode','claimmode_txt','claim2','claimmode2','claimmode2_txt','claim3','claimmode3','claimmode3_txt','compclaim','weaptype1','weaptype1_txt','weapsubtype1','weapsubtype1_txt','weaptype2','weaptype2_txt','weapsubtype2','weapsubtype2_txt','weaptype3','weaptype3_txt','weapsubtype3','weapsubtype3_txt','weaptype4','weaptype4_txt','weapsubtype4','weapsubtype4_txt','weapdetail','nkill','nkillus','nkillter','nwound','nwoundus','nwoundte','property','propextent','propextent_txt','propvalue','propcomment','ishostkid','nhostkid','nhostkidus','nhours','ndays','divert','kidhijcountry','ransom','ransomamt','ransomamtus','ransompaid','ransompaidus','ransomnote','hostkidoutcome','hostkidoutcome_txt','nreleased','addnotes','scite1','scite2','scite3','dbsource','INT_LOG','INT_IDEO','INT_MISC','INT_ANY','related']

    columns_wanted = ['iyear','country_txt','attacktype1_txt','targtype1_txt']

    # Read the file the caller asked for (previously the path was hard-coded
    # and the filename argument was silently ignored).
    df = pd.read_csv(filename,
                    compression='infer',
                    names=csv_columns,
                    usecols=columns_wanted,
                    )

    # Non-numeric years (e.g. a header line read as data) become NaN.
    df['iyear'] = pd.to_numeric(df['iyear'], errors='coerce')

    return df

In [None]:
def load_perp_file(filename: str) -> pd.DataFrame: 
    """
        Load the perpetrators CSV, retaining only the columns used by the analysis.
        Uses Pandas read_csv() with its compression='infer' option.

        The column names are supplied explicitly through names= and no header row is
        declared, so a header line in the file is read as an ordinary data row
        (its 'age' value is coerced to NaN below).

        param filename : path of the perpetrators file to read (e.g. perps.csv)
        return : dataframe with the columns first_name, last_name, full_name, age,
                 marital_status and citizenship_status
    """

    csv_columns = ['person_id','first_name','last_name','full_name','headshot','headshot_credit','gender','age','inv_informant','inv_public_tip','inv_community_or_family_tip','marital_status','terror_plot','terror_plot_2','plot_id','citizenship_status','charged_or_deceased','year_charged_or_deceased','date_charged','state_charged','state_charged_2','last_residency_state','last_residency_country','char_awlaki','char_contact_with_foreign_militant','char_overseas_military_training','char_us_military_experience','char_online_radicalization','targeted_jews_israel','targeted_military_installation']

    columns_wanted = ['first_name','last_name','full_name','age','citizenship_status','marital_status']

    # Read the file the caller asked for (previously the path was hard-coded
    # and the filename argument was silently ignored).
    df = pd.read_csv(filename,
                    compression='infer',
                    names=csv_columns,
                    usecols=columns_wanted,
                    )

    # Non-numeric ages (e.g. a header line read as data) become NaN.
    df['age'] = pd.to_numeric(df['age'],errors='coerce')

    return df

In [None]:
def load_income_file(filename: str) -> pd.DataFrame:
    """
        Load the income classification workbook and rename the columns the
        analysis relies on (country, region and income group).

        The first sheet is parsed with the first 5 rows skipped; the columns
        'x.2', 'x.5' and 'x.6' are renamed to 'country_txt', 'Region' and 'Income'.
        All other columns and all rows of the sheet are returned unchanged.

        param filename : path of the Excel workbook to read (e.g. CLASS.xls)
        return : dataframe of the first sheet with the renamed columns
        raises KeyError : if any of the renamed columns is absent from the sheet
    """

    renamed_columns = {'x.2' : 'country_txt','x.5':'Region','x.6':'Income'}

    # Use the workbook as a context manager so the file handle is released,
    # and read the path the caller passed instead of a hard-coded one.
    with pd.ExcelFile(filename) as xls:
        sheetX = xls.parse(0, skiprows=5)

    sheetX = sheetX.rename(columns = renamed_columns)

    # The original code selected these columns and discarded the result, which
    # only had the effect of failing when one was missing; keep that check explicit.
    missing = [name for name in renamed_columns.values() if name not in sheetX.columns]
    if missing:
        raise KeyError(f"{missing} not found in the first sheet of {filename!r}")

    return sheetX

In [None]:
def load_population_data(filename: str) -> pd.DataFrame:
    """
        Load the population data with the column names required for merging
        with the other files.

        The column names are supplied explicitly through names= and no header row is
        declared, so a header line in the file is read as an ordinary data row.

        param filename : path of the population file to read (e.g. country_population.csv)
        return : dataframe with the columns country_txt, Country Code, Indicator Name,
                 Indicator Code and one column per year, named '1960' through '2016'
    """

    column_names = ['country_txt', 'Country Code', 'Indicator Name', 'Indicator Code']
    # Year columns are string labels '1960' .. '2016'.
    column_names += [str(year) for year in range(1960, 2017)]

    # Read the file the caller asked for (previously the path was hard-coded
    # and the filename argument was silently ignored).
    population_data = pd.read_csv(filename, names=column_names)

    return population_data


### Hypothesis 1

In [None]:
def mergeCountryIncome(terrorism_data,income_data):
    """
        Attach the income classification to every terrorism record (used for the first hypothesis).

        param terrorism_data : dataframe of terrorism_data
        param income_data : dataframe of income_data, keyed by 'country_txt'
        return: terrorism_data without its first row, left-joined with income_data on 'country_txt'
    """

    # The first row is skipped because it causes NaNs in the data.
    records = terrorism_data.iloc[1:]

    # Left join: every remaining terrorism record is kept, matched or not.
    return pd.merge(records, income_data, how='left', on='country_txt')


In [None]:
def thirdWorldCountry(countryIncome):
    """
        Select the records of low / lower-middle income countries and count them per year.

        param countryIncome: DataFrame of records with 'iyear', 'country_txt' and 'Income' columns
        return lowIncome: the rows whose 'Income' is 'Lower middle income' or 'Low income'
        return lowIncomeCountries: dataframe indexed by iyear with the number of such rows ('country_countL')
    """

    low_income_groups = ['Lower middle income', 'Low income']

    in_group = countryIncome['Income'].isin(low_income_groups)
    lowIncome = countryIncome.loc[in_group]

    # One row per year; the value is the count of non-null country names.
    yearly_counts = lowIncome.groupby(['iyear']).agg(country_countL=('country_txt', 'count'))

    return lowIncome, yearly_counts


In [None]:
def firstWorldCountry(countryIncome):
    """
        Select the records of high / upper-middle income countries and count them per year.

        param countryIncome: DataFrame of records with 'iyear', 'country_txt' and 'Income' columns
        return highIncome: the rows whose 'Income' is 'High income' or 'Upper middle income'
        return highIncomeCountries: dataframe indexed by iyear with the number of such rows
                                    ('country_countH'), without the first year group
    """

    high_income_groups = ['High income', 'Upper middle income']

    in_group = countryIncome['Income'].isin(high_income_groups)
    highIncome = countryIncome.loc[in_group]

    yearly_counts = highIncome.groupby(['iyear']).agg(country_countH=('country_txt', 'count'))

    # The first (lowest) year group is discarded.
    # NOTE(review): thirdWorldCountry keeps its first year - confirm this asymmetry is intended.
    return highIncome, yearly_counts.iloc[1:]


In [None]:
def lowIncomePopulation(thirdWorldCountries,population_data):
    
    """
        This function calculates the total population of the low income countries year wise

        Each country is counted once. A year is left out of the result when the
        population of any of the countries is missing (NaN) for that year.

        param thirdWorldCountries: dataframe with a 'country_txt' column holding the low income countries
                                   (a country may appear on many rows)
        param population_data: dataframe of country_population file, with 'country_txt'
                               and the year columns '1970' .. '2016'
        return h_stack: dataframe with the columns 'iyear' (year label as a string) and
                        'Population_low' (summed population), one row per year
    """

    year_columns = [str(year) for year in range(1970, 2017)]

    # De-duplicate before the join so the merge handles one row per country
    # rather than one row per record; de-duplicate again afterwards in case
    # population_data lists a country more than once.
    countries = thirdWorldCountries.drop_duplicates('country_txt')
    lowIncomePopln = (
        countries
        .merge(population_data, on='country_txt', how='left')
        .drop_duplicates('country_txt')
    )

    # skipna=False: a missing value makes the whole year NaN, and that year is
    # then removed by dropna() below. DataFrame.iteritems(), used previously,
    # no longer exists in pandas 2.x.
    yearly_totals = lowIncomePopln[year_columns].sum(axis=0, skipna=False)

    # Row labels start at 1, as they did when the country_txt row was sliced off.
    horizontal_stack = pd.DataFrame(
        {'iyear': year_columns, 'Population_low': yearly_totals.to_numpy()},
        index=range(1, len(year_columns) + 1),
    )

    h_stack = horizontal_stack.dropna()

    return h_stack

In [None]:
def highIncomePopulation(firstWorldCountries,population_data):
    
    """
        This function calculates the total population of the high income countries year wise

        Each country is counted once. A year is left out of the result when the
        population of any of the countries is missing (NaN) for that year.

        param firstWorldCountries: dataframe with a 'country_txt' column holding the high income countries
                                   (a country may appear on many rows)
        param population_data: dataframe of country_population file, with 'country_txt'
                               and the year columns '1970' .. '2016'
        return h_stack: dataframe with the columns 'iyear' (year label as a string) and
                        'Population_high' (summed population), one row per year
    """

    year_columns = [str(year) for year in range(1970, 2017)]

    # De-duplicate before the join so the merge handles one row per country
    # rather than one row per record; de-duplicate again afterwards in case
    # population_data lists a country more than once.
    countries = firstWorldCountries.drop_duplicates('country_txt')
    highIncomePopln = (
        countries
        .merge(population_data, on='country_txt', how='left')
        .drop_duplicates('country_txt')
    )

    # skipna=False: a missing value makes the whole year NaN, and that year is
    # then removed by dropna() below. DataFrame.iteritems(), used previously,
    # no longer exists in pandas 2.x.
    yearly_totals = highIncomePopln[year_columns].sum(axis=0, skipna=False)

    # Row labels start at 1, as they did when the country_txt row was sliced off.
    horizontal_stack = pd.DataFrame(
        {'iyear': year_columns, 'Population_high': yearly_totals.to_numpy()},
        index=range(1, len(year_columns) + 1),
    )

    h_stack = horizontal_stack.dropna()
    return h_stack

In [None]:
def percentFirstWorldCountries(sumPopulationBYearHigh,firstWorldCountries):
    
    """
        This function divides the yearly number of terrorist activities by the yearly
        total population of the first world countries and expresses it as a percentage

        param sumPopulationBYearHigh: dataframe with 'iyear' and 'Population_high' columns
                                      (left unmodified by this function)
        param firstWorldCountries: dataframe with 'country_countH' per 'iyear'
        return percentfwc: the two inputs joined on iyear, plus the column
                           'division_high' = country_countH / Population_high * 100
    """

    # Convert the year labels on a copy: assigning into the argument would
    # silently change the caller's dataframe.
    population = sumPopulationBYearHigh.assign(
        iyear=pd.to_numeric(sumPopulationBYearHigh['iyear'], errors='coerce')
    )

    percentfwc = pd.merge(firstWorldCountries, population, on='iyear')

    percentfwc['division_high'] = (percentfwc['country_countH']/percentfwc['Population_high'])*100
        
    return percentfwc

In [None]:
def percentThirdWorldCountries(sumPopulationBYearlow,thirdWorldCountries):
    
    """
        This function divides the yearly number of terrorist activities by the yearly
        total population of the third world countries and expresses it as a percentage

        param sumPopulationBYearlow: dataframe with 'iyear' and 'Population_low' columns
                                     (left unmodified by this function)
        param thirdWorldCountries: dataframe with 'country_countL' per 'iyear'
        return percenttwc: the two inputs joined on iyear, plus the column
                           'division_low' = country_countL / Population_low * 100
    """

    # Convert the year labels on a copy: assigning into the argument would
    # silently change the caller's dataframe.
    population = sumPopulationBYearlow.assign(
        iyear=pd.to_numeric(sumPopulationBYearlow['iyear'], errors='coerce')
    )
    
    percenttwc = pd.merge(thirdWorldCountries, population, on='iyear')
    
    percenttwc['division_low'] = (percenttwc['country_countL']/percenttwc['Population_low'])*100
        
    return percenttwc

In [None]:
def chartPrepData(percentFWCountries,percentTWCountries):
    """
        Combine the first world and third world yearly figures into one frame for plotting.

        param percentFWCountries: dataframe with an 'iyear' key holding the first world figures
        param percentTWCountries: dataframe with an 'iyear' key holding the third world figures
        return result: inner join of both frames on 'iyear' (only years present in both are kept)
    """

    return pd.merge(percentFWCountries, percentTWCountries, how='inner', on='iyear')
    

In [None]:
# Plots the terrorism count (as a percentage of population) of first and third world countries.
# NOTE: percentFWCountries / percentTWCountries are assigned in the `__main__` cell at the
# bottom of this notebook, so that cell has to be run before this one.
# The value columns are named 'division_high' and 'division_low' by
# percentFirstWorldCountries / percentThirdWorldCountries; there is no 'division' column.
ax = percentFWCountries.plot(x='iyear', y='division_high', color='blue', grid=True, label='First World Countries Terrorism Count');

percentTWCountries.plot(x='iyear', y='division_low', color='magenta', grid=True, label='Third World Countries Terrorism Count', ax=ax) # https://stackoverflow.com/questions/25386870/pandas-plotting-with-multi-index

# `loc` takes a position such as 'best' or 'upper right', not the label text;
# the labels passed to plot() above are picked up automatically.
ax.legend()


plt.show()

### Hypothesis 2

In [None]:
def citizenship(terrorism_data,terrorists_data):
    """
        Join the terrorism records with the terrorists data so that the citizenship_status
        column is available for testing the second hypothesis.

        param terrorism_data: dataframe of terrorism file, with a 'country_txt' column
        param terrorists_data: dataframe of terrorists file, with a 'country_txt' column
        return: inner join of both dataframes on 'country_txt'
    """

    # Inner join: only countries present in both frames survive.
    return terrorism_data.merge(terrorists_data, on='country_txt', how='inner')



In [None]:
### Keeps only the attacks on Government targets (targtype1_txt) and counts them
### per citizenship_status to see which status attacked them the most.
options = ['Government (Diplomatic)', 'Government (General)']

res = citizenshipStatus.loc[citizenshipStatus['targtype1_txt'].isin(options)]

immigrant_status = res.groupby(['citizenship_status']).agg(cit_count=('targtype1_txt', 'count'))

# Bar chart: one bar per citizenship status.
x = immigrant_status.index.to_list()
y = immigrant_status['cit_count']

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(x, y)
plt.xticks(rotation=90)
plt.show()


In [None]:
### Counts, per target type, the records whose citizenship_status is
### 'Nonimmigrant Visa' to check who those visa holders targeted the most.
options = ['Nonimmigrant Visa']

res_4 = ageBins.loc[ageBins['citizenship_status'].isin(options)]

target_type = res_4.groupby(['targtype1_txt']).agg(cit_count=('citizenship_status', 'count'))

# Bar chart: one bar per target type.
x = target_type.index.to_list()
y = target_type['cit_count']

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(x, y)
plt.xticks(rotation=90)
plt.show()


In [None]:
def displayPlots(citizenshipStatus, countries=None):
    """
        For each country, print and plot how many attacks on Government targets
        ('Government (Diplomatic)' / 'Government (General)') were recorded per citizenship_status.

        One bar chart is shown per country, titled with the country name.
        The function returns nothing.

        Counts recorded by the original author for the USA (not re-verified here):

        citizenship_status:cit_count

        Illegal Immigrant:12
        Naturalized Citizen:96
        Nonimmigrant Visa:24
        Permanent Resident:61
        Refugee:12
        Unknown:21

        :param citizenshipStatus: dataframe with 'country_txt', 'targtype1_txt' and
                                  'citizenship_status' columns.
        :param countries: optional iterable of country names to plot; when omitted the
                          built-in list of 13 countries is used.
        """
    
    if countries is None:
        countries=['United States','Brazil','Mexico','France','Egypt','Jordan','Iran','Syria','Australia','India','Pakistan','China','Philippines']

    government_targets=['Government (Diplomatic)','Government (General)']

    for country in countries:

        # Records of this country only.
        css = citizenshipStatus[citizenshipStatus['country_txt'].isin([country])]

        # ... restricted to attacks on Government targets.
        css_targetType = css[css['targtype1_txt'].isin(government_targets)]

        immigrant_status=css_targetType.groupby(['citizenship_status']).agg(countCountry = ('targtype1_txt','count'))

        print(immigrant_status)

        x = immigrant_status.index.tolist()
        y = immigrant_status['countCountry']

        fig = plt.figure()
        ax = fig.add_axes([0,0,1,1])
        ax.bar(x,y)
        plt.xticks(rotation=90)
        ax.set_title(country)
        plt.show()
        # Release the figure so that looping over many countries does not
        # accumulate open figures.
        plt.close(fig)

In [None]:
# NOTE(review): citizenshipStatus is only assigned in the `__main__` cell at the
# bottom of this notebook, so that cell has to be run before this one.
displayPlots(citizenshipStatus)

### Hypothesis 3

In [None]:
def age(citizenshipStatus,perpetrators_data):
    
    """
        Join citizenshipStatus with perpetrators_data so that the columns of the perps file
        (such as age and marital_status) are available for testing the third hypothesis.

        param citizenshipStatus: dataframe with a 'citizenship_status' column
        param perpetrators_data: dataframe of perps file, with a 'citizenship_status' column
        return age_df: inner join of both dataframes on 'citizenship_status'
    """
    # Every row of one frame is paired with every row of the other that has
    # the same citizenship_status.
    return citizenshipStatus.merge(perpetrators_data, on='citizenship_status', how='inner')

In [None]:
        
# Put every age into a 10-year bin labelled by the bin's upper edge (10, 20, ..., 80).
ageBins['bin'] = pd.cut(
    ageBins['age'],
    bins=list(range(0, 90, 10)),
    labels=list(range(10, 90, 10)),
)

# Count the unmarried records in each age bin.
options = ['Unmarried']

res_1 = ageBins.loc[ageBins['marital_status'].isin(options)]

res_2 = res_1.groupby(['bin']).agg(count=('marital_status', 'count'))

x = res_2.index.to_list()
y = res_2['count']

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(x, y)
ax.set_title('World')
plt.show()


In [None]:
def displayAgePlots(ageBinsFilter, countries=None):
    """
        For each country, plot how many records with marital_status 'Unmarried'
        fall into each 10-year age bin.

        Ages are binned with pd.cut into (0,10], (10,20], ... (70,80] and every bin is
        labelled by its upper edge (10, 20, ..., 80). Ages outside (0, 80] and missing
        ages fall into no bin. One bar chart is shown per country, titled with the
        country name. The function returns nothing and does not modify ageBinsFilter.

        Counts recorded by the original author (not re-verified here; the source
        listed them under the range labels below):

        bin:count
        0-9:0
        10-19:7799
        20-29:19415
        30-39:3263
        40-49:0
        50-59:392
        60-69:572
        70-80:0

        :param ageBinsFilter: dataframe with 'country_txt', 'age' and 'marital_status' columns.
        :param countries: optional iterable of country names to plot; when omitted the
                          built-in list of 13 countries is used.
        """
    
    if countries is None:
        countries=['United States','Brazil','Mexico','France','Egypt','Jordan','Iran','Syria','Australia','India','Pakistan','China','Philippines']

    age_edges = [0,10,20,30,40,50,60,70,80]
    age_labels = [10,20,30,40,50,60,70,80]

    for country in countries:

        # Work on a copy of the country's rows: adding a column to a filtered
        # slice of the caller's dataframe is a chained assignment.
        ageBins = ageBinsFilter[ageBinsFilter['country_txt'].isin([country])].copy()

        ageBins['bin'] = pd.cut(ageBins['age'], bins = age_edges, labels = age_labels)

        res_1 = ageBins[ageBins['marital_status'].isin(['Unmarried'])]

        # observed=False keeps the empty age bins in the result (count 0), which
        # is what the default did when this notebook was written.
        res_2 = res_1.groupby(['bin'], observed=False).agg(count = ('marital_status','count'))

        x = res_2.index.tolist()
        y = res_2['count']
        
        fig = plt.figure()
        ax = fig.add_axes([0,0,1,1])
        ax.bar(x,y)
        ax.set_title(country)
        plt.show()
        # Release the figure so that looping over many countries does not
        # accumulate open figures.
        plt.close(fig)

In [None]:
# NOTE(review): ageBins is only assigned in the `__main__` cell at the bottom of
# this notebook, so that cell has to be run before this one.
displayAgePlots(ageBins)

In [None]:
# Main cell: loads every input file and builds the dataframes for the three hypotheses.
# NOTE(review): the plotting cells placed earlier in this notebook read the names assigned
# here (percentFWCountries, percentTWCountries, citizenshipStatus, ageBins), so this cell
# has to be executed before them.
if __name__ == '__main__' :
    
    
    # Loading Data
    
    terrorism_data = load_terrorism_file('terrorism.csv')              # terrorism records: year, country, attack type, target type
    perpetrators_data = load_perp_file('perps.csv')                    # names, age, marital status and citizenship status
    terrorists_data = pd.read_csv('terrorists.csv')                    # read as-is; its 'country' column is renamed below
    income_data = load_income_file('CLASS.xls')                        # income group per country
    population_data = load_population_data('country_population.csv')   # population per country, one column per year
    
    
    # Hypothesis 1
    # Attach the income group to every terrorism record.
    countryIncome = mergeCountryIncome(terrorism_data,income_data)
    
    # Split into high / low income records and their yearly counts.
    firstWorldIncome,firstWorldCountries= firstWorldCountry(countryIncome)
    thirdWorldIncome,thirdWorldCountries= thirdWorldCountry(countryIncome)
    
    
    # Yearly total population of each income group.
    fwcountriesPercentPop = highIncomePopulation(firstWorldIncome,population_data)
    twcountriesPercentPop = lowIncomePopulation(thirdWorldIncome,population_data)
    
    # Yearly record count as a percentage of the group's population.
    percentFWCountries = percentFirstWorldCountries(fwcountriesPercentPop,firstWorldCountries)
    percentTWCountries = percentThirdWorldCountries(twcountriesPercentPop,thirdWorldCountries)
    
    # Both groups side by side, joined on the year.
    percentBothType = chartPrepData(percentFWCountries,percentTWCountries)
   

    # Hypothesis 2
    # Rename so that the join key matches the 'country_txt' column of terrorism_data.
    terrorists_data = terrorists_data.rename(columns={'country': 'country_txt'})

    citizenshipStatus = citizenship(terrorism_data,terrorists_data)
    
    
    # Hypothesis 3
    # Despite its name, ageBins has no 'bin' column yet at this point; it is the
    # citizenshipStatus frame joined with the perpetrators data.
    ageBins=age(citizenshipStatus,perpetrators_data)
    