# The Analysis is done on the Titanic Dataset.

### DataLoad for Titanic Dataset

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import chi2
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


# Read the raw passenger records; the CSV is expected next to the notebook.
titanic_df = pd.read_csv(filepath_or_buffer='titanic-data.csv')


In [None]:

# Handle missing ages by dropping every record whose Age is NaN.
# dropna(subset=...) returns a new frame, so titanic_df itself stays untouched.
titanic_df_modified = titanic_df.dropna(subset=['Age'])

# Cabin and Embarked are deliberately left as NaN.

shape = titanic_df_modified.shape

print ('\nDataframe Rows:{} - Columns:{}\n'.format(shape[0],shape[1]) )

# Preview of the first five rows only. The full filtered frame holds just the
# records with a valid Age (714 rows according to the original author's note).
titanic_df_modified.head(5)



### Hypothesis Testing

#Type of Test : Chi Square test of Independence (for categorical variable)

#Significance level (α) = 0.05
# degreesofFreedom = (rows - 1) * (columns - 1) = (2 - 1) * (3 - 1) = 2
# chicritical = 5.99 (for 0.05 significance)

#Data prep for running chi-square test.

#General parameters

row_ind = ['Survived', 'Not Survived'] #Dependant variable
col_names = ['class1', 'class2', 'class3'] #Independant variable

# Build the contingency table straight from the data instead of typing the
# counts in by hand, so it always reflects the file that was actually loaded.
# NOTE(review): assumes Survived is coded 1 = survived / 0 = not survived and
# Pclass takes the values 1, 2, 3 - confirm against the CSV.
# reindex() fixes the row/column order to match row_ind / col_names and fills
# any combination that is absent from the data with 0.
contigency_df = (
    pd.crosstab(titanic_df['Survived'], titanic_df['Pclass'])
    .reindex(index=[1, 0], columns=[1, 2, 3], fill_value=0)
)
contigency_df.index = row_ind
contigency_df.columns = col_names

# Total number of passengers counted in the table.
N = int(contigency_df.values.sum())

#calculating Marginal and conditional parameters to use later
marginalvalues = contigency_df.sum(axis = 1)     # row totals (per survival outcome)
conditionalvalues = contigency_df.sum(axis = 0)  # column totals (per class)



#function to calculate the expected value for each cell of the contingency table

def expectedvalues_calc(marginal, conditional, N):
    """Return the expected counts under independence for a contingency table.

    Each cell is ``row_total * column_total / N``.

    Parameters
    ----------
    marginal : pandas.Series
        Row totals; its index supplies the row labels of the result.
    conditional : pandas.Series
        Column totals; its index supplies the column labels of the result.
    N : int or float
        Grand total of the table.

    Returns
    -------
    pandas.DataFrame
        Float-valued frame with one row per entry of ``marginal`` and one
        column per entry of ``conditional``.
    """
    # Outer product of the two margins. The totals are multiplied first and
    # divided by N last, the same operation order as a hand calculation.
    # Working on the underlying arrays avoids integer-key lookups such as
    # marginal[0], which pandas no longer treats as positional on a
    # string-labelled Series.
    row_totals = marginal.values[:, None]
    col_totals = conditional.values[None, :]
    expected = (row_totals * col_totals) / N

    return pd.DataFrame(expected, index=marginal.index, columns=conditional.index)

expected_df = expectedvalues_calc(marginalvalues, conditionalvalues,N)

#calculating chisquare value below using the expected and given DF
# Sum over all cells of (observed - expected)^2 / expected.
chisquarevalue = float(((contigency_df - expected_df)**2/expected_df).values.sum())

# Degrees of freedom for a test of independence: (rows - 1) * (columns - 1).
n_rows, n_cols = contigency_df.shape
degrees_of_freedom = (n_rows - 1) * (n_cols - 1)

# Upper-tail probability of the chi-square distribution at the observed
# statistic, computed from the data rather than read off a printed table.
p_value = chi2.sf(chisquarevalue, degrees_of_freedom)

print ('Contigency Table \n')
print (contigency_df,'\n')

print ('ChiSqaureValue: {}\n'.format(chisquarevalue))

print ('Degrees of freedom: {}'.format(degrees_of_freedom))
print ('P-value: {:.3g}'.format(p_value))


%pylab inline

#Did class have an effect on Survival rate.?

#splitting the data frame based on the class
first_df = titanic_df.loc[titanic_df.Pclass == 1]
second_df = titanic_df.loc[titanic_df.Pclass == 2]
third_df = titanic_df.loc[titanic_df.Pclass == 3]

group_by_first = first_df.groupby('Survived', as_index = False).count()
group_by_second = second_df.groupby('Survived',as_index = False).count()
group_by_third = third_df.groupby('Survived',as_index = False).count()

bar_width = 0.35

plt.bar([0.5, 1.75],group_by_first['PassengerId'],bar_width, color = '#f48642',label = 'First Class' )

plt.bar([0.5 + bar_width, 1.75 + bar_width], group_by_second['PassengerId'], bar_width, color = 'green',
        label = 'Second Class')

#plt.xticks(group_by_first['Survived']  + 0.5, labels, legend = 'Second Class')

plt.bar([0.85 + bar_width, 1.75 + 2* bar_width], group_by_third['PassengerId'], bar_width, color = '#42f4aa',
       label = 'Third Class')
#plt.xticks(group_by_first['Survived']  + 0.5, labels)

labels_s = ['Not Survived', 'Survived']

plt.xticks(group_by_first['Survived'] + 1, labels_s)

plt.xlabel('Survived or Not')
plt.ylabel('# of Survivors')
plt.title("Survival rate by class")
plt.legend()
plt.show()



In [None]:
# Setting Input and output variables

# LogisticRegression needs a purely numeric, NaN-free feature matrix, so:
#   * the target and the PassengerId row identifier are removed,
#   * only numeric columns are kept,
#   * any numeric column that still contains NaN is dropped.
# Dropping columns (not rows) keeps X aligned with y.
# NOTE(review): categorical columns are excluded here; encode them explicitly
# (e.g. with pd.get_dummies) if they should take part in the model.
target_col = "Survived"
feature_candidates = titanic_df_modified.drop(columns=[target_col, "PassengerId"], errors="ignore")
X = feature_candidates.select_dtypes(include="number").dropna(axis=1)
y = titanic_df_modified[target_col]

# Divide the dataset into 80/20 ratio; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

#Model Fitting
# The features are unscaled, so allow more iterations than the default
# to give the solver room to converge.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train,y_train)


#Prediction model
predictions = logmodel.predict(X_test)
print(classification_report(y_test, predictions))

#Confusion matrix
confusion_matrix(y_test, predictions)


Based on the test performed and the visualization, we can conclude that passenger class was associated with whether a passenger survived or not: a chi-square value this large has an extremely low probability of occurring if class and survival were independent.

## Hence, We reject the Null hypothesis.

## Final Conclusion

Although the limitations of the data did not allow us to run Z-tests or t-tests, the chi-square test for categorical variables provided some interesting and important insights.

#### Analysis #1 :
The relation between class (socio-economic status) and survival was, in my opinion, the most important one. It was, to a certain extent, aligned with my expectation, but the strength of that relation surprised me.

However, this does not imply any causation, and it should not be used as an indicator for arriving at any future decisions.

