# Titanic: Machine Learning from Disaster
_Tutorial 1 : Basic Introduction_

# 1) Importing Packages

In [1]:
import pandas as pd


### 2. Load the training and testing data into pandas DataFrame objects

In [2]:
# Read the training and test CSV files into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./Downloads/train.csv", "./Downloads/test.csv")
)

### 3. Understand your data with the describe(), head() and info() functions

In [3]:
# Preview the first five rows of the training set.
# (dataframe.tail(n) shows the last n rows instead; n defaults to 5.)
# test.head() would give the same preview for the test set.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the training set.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Distinct values present in the Embarked column (missing entries show up as nan).
train["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

### 4. Data cleaning and preprocessing
#### i) Impute -- fill null values ---> .fillna()
#### ii) Handle non-numerical values -- clean with data_frame.loc[cond, col_name] = value
#### iii) Deal with categorical values

In [6]:
# Count the missing values in each column of the training set.
train.isnull().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

**Now, how to deal with this missing data?**

**Eliminating** and **imputing** are two different strategies for handling samples or features with missing values.

**Eliminating** means we simply remove from the dataset the features (columns) or samples (rows) that have a certain number of missing values. The disadvantage of this strategy is that we run the risk of losing valuable information that our classifier needs to discriminate between classes.

**Imputing** means we handle missing data by applying interpolation techniques to **estimate the missing values**. One of the most common techniques is *mean imputation*; replacing with the median or the mode are other options.

In [7]:
# Sex has no missing values, but it is a string column: encode male -> 0, female -> 1.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    train.loc[train["Sex"] == sex_label, "Sex"] = sex_code

# Age is the column with only 714 non-null entries: fill the gaps with the median age.
train["Age"] = train["Age"].fillna(train["Age"].median())

# Embarked:
#   1) fill the missing values with 'S'
#   2) encode 'S' -> 0, 'C' -> 1, 'Q' -> 2
train["Embarked"] = train["Embarked"].fillna("S")
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    train.loc[train["Embarked"] == port_label, "Embarked"] = port_code

# Re-check the summary statistics after imputation.
# (train[["Sex", "Embarked"]] would display just the two encoded columns.)
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Replace missing values (i.e. NaN) in Embarked with 'S', since 'S' is the most frequent value?
_Replacing with the mean? Replacing with the mode? Replacing with the median?_

### iloc vs loc
`.loc` takes slices based on labels.

`.iloc` uses the observations' integer position.

# List of important methods/tricks
1. `.value_counts()`
2. `.fillna()`
3. `.loc[row, col]`
4. `train[["Pclass", "Sex", "Age", "Fare"]]` :: selecting multiple columns

# 5 Fitting model : Decision Tree


In [8]:
import numpy as np
from sklearn import tree

# Build the numpy arrays the estimator consumes:
#   target   -> the Survived labels
#   features -> the four predictor columns, in this order
target = train["Survived"].values
features = train[["Pclass", "Sex", "Age", "Fare"]].values

# Fit the first decision tree: my_tree_one.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed the importances and the score below can change
# from one run of the notebook to the next.
my_tree_one = tree.DecisionTreeClassifier(random_state=0)
my_tree_one = my_tree_one.fit(features, target)

# Feature importances follow the column order of `features`.
# The score is computed on the same rows the tree was fitted on, so it is an
# optimistic (training) accuracy, not an estimate of generalisation.
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features, target))

[ 0.12482906  0.31274009  0.24774034  0.31469051]
0.977553310887


# 6) Inference algo 

In [9]:
# Inspect the test set before cleaning it.
# The Fare count in the summary is one short of the other columns, i.e. Fare has a null value.
print(test.describe(), test.head(), sep="\n")

       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000
25%     996.250000    1.000000   21.000000    0.000000    0.000000    7.895800
50%    1100.500000    3.000000   27.000000    0.000000    0.000000   14.454200
75%    1204.750000    3.000000   39.000000    1.000000    0.000000   31.500000
max    1309.000000    3.000000   76.000000    8.000000    9.000000  512.329200
   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles,

In [10]:
# Clean the test set with the same steps that were applied to the training set.

# Fare: fill the missing value with the median fare of the test set.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Sex: encode male -> 0, female -> 1.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    test.loc[test["Sex"] == sex_label, "Sex"] = sex_code

# Age: fill the missing values with the median age of the test set.
test["Age"] = test["Age"].fillna(test["Age"].median())

# Embarked:
#   1) fill the missing values with 'S'
#   2) encode 'S' -> 0, 'C' -> 1, 'Q' -> 2
test["Embarked"] = test["Embarked"].fillna("S")
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    test.loc[test["Embarked"] == port_label, "Embarked"] = port_code


In [11]:


# Pull the same four predictors out of the test set: Pclass, Sex, Age and Fare.
feature_columns = ["Pclass", "Sex", "Age", "Fare"]
test_features = test[feature_columns].values

# Predict survival for every test passenger with the fitted decision tree.
my_prediction = my_tree_one.predict(test_features)

# Assemble the submission frame: indexed by PassengerId, with a single
# Survived column holding the predictions.
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(
    data=my_prediction,
    index=PassengerId,
    columns=["Survived"],
)

# Sanity check: the frame should have 418 entries.
print(my_solution.shape)

# Write the submission to my_solution_one.csv, labelling the index column PassengerId.
my_solution.to_csv("my_solution_one.csv", index_label=["PassengerId"])

(418, 1)


# Trying logistic regression..

In [12]:
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# sklearn.model_selection provides the same helpers.
from sklearn.model_selection import cross_val_score, train_test_split

# Reuse the feature matrix and labels built for the decision tree.
X = features
y = target

# Optional hold-out split (not used below):
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Instantiate a logistic regression model and fit it on the full training data.
model = LogisticRegression()
model.fit(X, y)

# Accuracy on the training set (the same rows the model was fitted on).
print(model.score(X, y))

# Inference: predicted classes for the test passengers.
predicted = model.predict(test_features)

# Class probabilities; note these are computed for the training rows (X),
# not for test_features. Column 0 is P(class 0), column 1 is P(class 1).
probs = model.predict_proba(X)
print(probs)




0.795735129068
[[ 0.8878418   0.1121582 ]
 [ 0.10494665  0.89505335]
 [ 0.41885174  0.58114826]
 ..., 
 [ 0.42683603  0.57316397]
 [ 0.52214408  0.47785592]
 [ 0.91313002  0.08686998]]


# As you can see, the classifier predicts a 1 (survived) whenever the probability in the second column is greater than 0.5.

# trying Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Random forests draw bootstrap samples and random feature subsets, so the seed
# is fixed to make the fitted model reproducible between runs.
clf = RandomForestClassifier(random_state=0)
#scores = cross_val_score(clf, X, y)
#print(scores.mean())

# Fit on the full training data, then get class probabilities for the test passengers.
classifier = clf.fit(X, y)
predictions = classifier.predict_proba(test_features)
#print(predictions)

# Alternative kept for reference (not run): a larger forest scored on the training data.
#random_forest = RandomForestClassifier(n_estimators=100)
#random_forest.fit(X, y)
#Y_pred = random_forest.predict(test_features)
#random_forest.score(X, y)
#scores = cross_val_score(clf, X, y)
#print(scores.mean())


In [14]:
!conda install -c aterrel xgboost

Fetching package metadata ...........
Solving package specifications: .


UnsatisfiableError: The following specifications were found to be in conflict:
  - python 3.6*
  - xgboost -> numpy 1.9* -> python 2.6* -> openssl 1.0.1*
Use "conda info <package>" to see the dependencies for each package.




In [15]:
import xgboost as xgb
# Gradient-boosted trees: shallow trees, many boosting rounds, small learning rate.
xgb_params = dict(max_depth=3, n_estimators=300, learning_rate=0.05)
gbm = xgb.XGBClassifier(**xgb_params).fit(X, y)

# Predicted classes for the test passengers.
predictions = gbm.predict(test_features)
print(predictions)

ModuleNotFoundError: No module named 'xgboost'