In [104]:
#https://www.dataquest.io/blog/pandas-python-tutorial/  -- very nice tutorial
#https://www.kaggle.com/omarelgabry/a-journey-through-titanic
import pandas as pd
from pandas import Series,DataFrame
# Directory that holds the Titanic CSV files. Kept in one place so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = "/home/kushagra/ML-Data/Titanic"

# Load the training and test splits into separate frames.
titanic_df = pd.read_csv(DATA_DIR + "/train.csv")
test_df    = pd.read_csv(DATA_DIR + "/test.csv")

In [105]:
# Preview the first rows of the training frame (head() shows five by default)
# as a quick sanity check that the CSV was parsed as expected.
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [106]:
# Summarize the frame: number of rows, per-column non-null counts and dtypes,
# and memory usage. A non-null count below the row count means that column
# has missing values.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [107]:
# Remove columns that are not used further on. axis=1 tells drop() to delete
# columns rather than rows. PassengerId is dropped from the training frame
# only; the test frame keeps it.
unused_train_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
unused_test_columns = ['Name', 'Ticket', 'Cabin']

titanic_df = titanic_df.drop(unused_train_columns, axis=1)
test_df = test_df.drop(unused_test_columns, axis=1)

In [108]:
# Display an arbitrary positional slice: the first five rows, all columns.
titanic_df.iloc[:5]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [109]:
# iloc also accepts a list of column positions instead of a slice.
ls = [0, 1, 4]
titanic_df.iloc[:5, ls]
# The same indexer can be the target of an assignment; in that case nothing
# is displayed and the selected cells are overwritten instead.

Unnamed: 0,Survived,Pclass,SibSp
0,0,3,1
1,1,1,1
2,1,3,0
3,1,1,1
4,0,3,0


In [110]:
# To specify columns by name instead of by position, use .loc rather than .iloc.
# .loc slices by label and includes the end label, so 0:5 selects six rows here.
ds = titanic_df.loc[:5, ['Survived', 'Pclass']]
ds  # display the whole selection

Unnamed: 0,Survived,Pclass
0,0,3
1,1,1
2,1,3
3,1,1
4,0,3
5,0,3


In [111]:
# A DataFrame can also be built from an ordinary 2-D list (one inner list per row).
# Each column of a DataFrame is a pandas Series and can be accessed by name,
# e.g. titanic_df["Sex"].
df = pd.DataFrame(data=[[0, 1], [1, 0]], columns=["A", "B"])
df  # display the whole frame

Unnamed: 0,A,B
0,0,1
1,1,0


In [112]:
# Pairwise correlation between the numeric columns. corr() measures linear
# relationships, so it is a quick way to spot strongly correlated (largely
# redundant) features; consider leaving such features out of training, since
# they may hurt accuracy.
# The numeric columns are selected explicitly: older pandas versions silently
# skipped string columns in corr(), but newer versions raise on them instead.
titanic_df.select_dtypes(include=["number"]).corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [113]:
# Example of corr() on perfectly linearly related data: every value in the
# second column is 2*x + 3 of the value in the first, so all coefficients are 1.
import numpy as np

ls = list(range(100))
ls1 = [2 * value + 3 for value in ls]

# Stack the two series as rows, then transpose to get one column per series
# (shape (100, 2)).
D2 = np.array([ls, ls1]).transpose()
df = pd.DataFrame(D2)
df.corr()

Unnamed: 0,0,1
0,1.0,1.0
1,1.0,1.0


In [114]:
# Locate every missing (NaN/None) and every infinite cell in the frame.
# `na` and `inf` each hold the positional row index of every offending cell in
# row-major order, so a row with several bad cells appears once per bad cell.

# Take the dimensions from the frame itself instead of hard-coding them, so the
# cell stays correct after rows or columns are dropped.
num_row, num_col = titanic_df.shape

# Vectorized null test. ndarray.nonzero() returns (row_positions, col_positions)
# of the True cells; only the row positions are kept.
na = titanic_df.isnull().values.nonzero()[0].tolist()

# Infinity can only occur in numeric columns; |x| == inf matches both +inf and -inf.
numeric_cells = titanic_df.select_dtypes(include=["number"])
inf = (numeric_cells.abs() == float("inf")).values.nonzero()[0].tolist()

print(len(na))

179


In [116]:
# No value used for training should be NaN: either fill the gaps with a chosen
# value, or drop the data point when no sensible value can be chosen.
# Here the missing Embarked entries are filled with "S".
titanic_df["Embarked"] = titanic_df["Embarked"].fillna(value="S")

In [122]:
# Drop every row that still contains a missing value. This is not a great idea
# here, since roughly one in five data points has a null entry.
# Infinite values are treated as missing too: they are turned into NaN first so
# that dropna() removes those rows as well. This is done explicitly rather than
# through the 'mode.use_inf_as_null' option, which newer pandas no longer provides.
titanic_df = titanic_df.replace([float("inf"), float("-inf")], float("nan")).dropna()

In [123]:
# Re-check the frame after the cleaning steps: every column should now report
# the same non-null count as the number of rows.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 8 columns):
Survived    714 non-null int64
Pclass      714 non-null int64
Sex         714 non-null object
Age         714 non-null float64
SibSp       714 non-null int64
Parch       714 non-null int64
Fare        714 non-null float64
Embarked    714 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 50.2+ KB


In [None]:
%matplotlib inline #setup matplotlib in jupyter
