<a href="https://colab.research.google.com/github/hiba007/data-science/blob/master/Data_Analysis_of_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Wrangling

In [0]:
import pandas as pd
# Read the training and test splits from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Keep both frames together in one list.
combine = [train_df, test_df]

# Preview the first five training rows.
train_df.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
import numpy as np

# Boolean frame with the same shape as train_df: True where a value is missing.
missing_data = train_df.isna()
missing_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False


**Count missing values in each column**

In [0]:
# For every column, print its name followed by the tally of missing (True)
# versus present (False) entries, then a blank separator line.
for col_name, null_flags in missing_data.items():
  print(col_name)
  print(null_flags.value_counts())
  print("")


PassengerId
False    891
Name: PassengerId, dtype: int64

Survived
False    891
Name: Survived, dtype: int64

Pclass
False    891
Name: Pclass, dtype: int64

Name
False    891
Name: Name, dtype: int64

Sex
False    891
Name: Sex, dtype: int64

Age
False    714
True     177
Name: Age, dtype: int64

SibSp
False    891
Name: SibSp, dtype: int64

Parch
False    891
Name: Parch, dtype: int64

Ticket
False    891
Name: Ticket, dtype: int64

Fare
False    891
Name: Fare, dtype: int64

Cabin
True     687
False    204
Name: Cabin, dtype: int64

Embarked
False    889
True       2
Name: Embarked, dtype: int64



**Dropping the Cabin column because it contains many NaN values, and the Ticket column because it plays no role in survival**

In [0]:
# Remove the Cabin and Ticket columns from both splits.
unused_columns = ['Cabin', 'Ticket']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

# Rebuild the list so it refers to the trimmed frames.
combine = [train_df, test_df]


In [0]:
# Show the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [0]:
# Boolean frame: True where a test-split value is missing.
missing_test = test_df.isna()

# Print each column name with its missing/present tally.
for col_name, null_flags in missing_test.items():
  print(col_name)
  print(null_flags.value_counts())
  print("")


PassengerId
False    418
Name: PassengerId, dtype: int64

Pclass
False    418
Name: Pclass, dtype: int64

Name
False    418
Name: Name, dtype: int64

Sex
False    418
Name: Sex, dtype: int64

Age
False    332
True      86
Name: Age, dtype: int64

SibSp
False    418
Name: SibSp, dtype: int64

Parch
False    418
Name: Parch, dtype: int64

Fare
False    417
True       1
Name: Fare, dtype: int64

Embarked
False    418
Name: Embarked, dtype: int64



**Embarked has two missing values in train_df (none in test_df), so we replace them with the most frequent value**

In [0]:
# Frequency of each Embarked value in the training split.
train_df.loc[:, 'Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [0]:
# Label of the most frequent Embarked value.
port_counts = train_df['Embarked'].value_counts()
port_counts.idxmax()

'S'

In [0]:
# Fill missing Embarked values with the most frequent value in the column.
# The result is assigned back to the column: replace(..., inplace=True) on
# train_df['Embarked'] operates on an intermediate Series, which is not
# guaranteed to update train_df (and never does under pandas Copy-on-Write).
most_common_port = train_df['Embarked'].value_counts().idxmax()
train_df['Embarked'] = train_df['Embarked'].fillna(most_common_port)

**Now replacing NaN values of Age with the mean**

In [0]:
# Mean age over the training rows where Age is present (mean() skips NaN).
avg_age = train_df['Age'].astype('float').mean()

# Assign the filled column back instead of calling replace(..., inplace=True)
# on the selected column: that chained in-place call works on an intermediate
# Series and is not guaranteed to modify train_df.
train_df['Age'] = train_df['Age'].fillna(avg_age)

In [0]:
# Mean age over the test rows where Age is present (mean() skips NaN).
# NOTE(review): this overwrites avg_age with the test split's own mean;
# confirm that imputing the test set from its own statistics is intended.
avg_age = test_df['Age'].astype('float').mean()

# Assign the filled column back instead of calling replace(..., inplace=True)
# on the selected column: that chained in-place call works on an intermediate
# Series and is not guaranteed to modify test_df.
test_df['Age'] = test_df['Age'].fillna(avg_age)

In [0]:
# Display the dtype of every column in the training frame.
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

In [0]:
# One indicator column per Sex value, built separately for each split.
train_dummy, test_dummy = (
  pd.get_dummies(frame['Sex']) for frame in (train_df, test_df)
)
train_dummy.head(n=5)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [0]:
# One indicator column per Embarked value, built separately for each split.
Embark_train, Embark_test = (
  pd.get_dummies(frame['Embarked']) for frame in (train_df, test_df)
)



In [0]:
# Append the Sex and Embarked indicator columns to the training frame.
train_parts = [train_df, train_dummy, Embark_train]
train_df = pd.concat(train_parts, axis='columns')
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,female,male,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0,1,0,0,1


In [0]:
# Drop the Sex and Embarked columns from the training frame in one call.
train_df.drop(columns=['Sex', 'Embarked'], inplace=True)

In [0]:
# Show the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,7.25,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,7.925,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,8.05,0,1,0,0,1


**Now concatenating the one-hot encoded columns with the test data so that ML models can work on it**

In [0]:
# Append the Sex and Embarked indicator columns to the test frame.
test_parts = [test_df, test_dummy, Embark_test]
test_df = pd.concat(test_parts, axis='columns')

# Drop the Sex and Embarked columns in one call.
test_df.drop(columns=['Sex', 'Embarked'], inplace=True)

In [0]:
# Show the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,892,3,"Kelly, Mr. James",34.5,0,0,7.8292,0,1,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,7.0,1,0,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,9.6875,0,1,0,1,0
3,895,3,"Wirz, Mr. Albert",27.0,0,0,8.6625,0,1,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,12.2875,1,0,0,0,1


In [0]:
# Family = SibSp + Parch + 1, computed the same way for both splits.
for frame in (train_df, test_df):
  frame['Family'] = frame['SibSp'] + frame['Parch'] + 1

In [0]:
# Show the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Fare,female,male,C,Q,S,Family
0,892,3,"Kelly, Mr. James",34.5,0,0,7.8292,0,1,0,1,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,7.0,1,0,0,0,1,2
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,9.6875,0,1,0,1,0,1
3,895,3,"Wirz, Mr. Albert",27.0,0,0,8.6625,0,1,0,0,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,12.2875,1,0,0,0,1,3


In [0]:
# Is_Alone is 1 where Family equals 1 and 0 everywhere else.
train_df['Is_Alone'] = (train_df['Family'] == 1).astype('int64')

In [0]:
# Is_Alone is 1 where Family equals 1 and 0 everywhere else.
test_df['Is_Alone'] = (test_df['Family'] == 1).astype('int64')

**Now we rely on the Is_Alone column and drop the SibSp, Parch and Family columns**

In [0]:
# Remove SibSp, Parch and Family from the training frame in place.
family_columns = ['SibSp', 'Parch', 'Family']
train_df.drop(columns=family_columns, inplace=True)

In [0]:
# Remove SibSp, Parch and Family from the test frame in place.
family_columns = ['SibSp', 'Parch', 'Family']
test_df.drop(columns=family_columns, inplace=True)

In [0]:
# Show the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,Fare,female,male,C,Q,S,Is_Alone
0,1,0,3,"Braund, Mr. Owen Harris",22.0,7.25,0,1,0,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,71.2833,1,0,1,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,7.925,1,0,0,0,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,53.1,1,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",35.0,8.05,0,1,0,0,1,1


**Dropping Name and PassengerId as we do not use them**

In [0]:
# Remove PassengerId and Name from the training frame in place.
id_columns = ['PassengerId', 'Name']
train_df.drop(columns=id_columns, inplace=True)

In [0]:
# Remove PassengerId and Name from the test frame in place.
id_columns = ['PassengerId', 'Name']
test_df.drop(columns=id_columns, inplace=True)



In [0]:
# Mean fare over the test rows where Fare is present (mean() skips NaN).
avg = test_df['Fare'].astype('float').mean()

# Assign the filled column back instead of calling replace(..., inplace=True)
# on the selected column: that chained in-place call works on an intermediate
# Series and is not guaranteed to modify test_df.
test_df['Fare'] = test_df['Fare'].fillna(avg)

In [0]:
# Show the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,Survived,Pclass,Age,Fare,female,male,C,Q,S,Is_Alone
0,0,3,22.0,7.25,0,1,0,0,1,0
1,1,1,38.0,71.2833,1,0,1,0,0,0
2,1,3,26.0,7.925,1,0,0,0,1,1
3,1,1,35.0,53.1,1,0,0,0,1,0
4,0,3,35.0,8.05,0,1,0,0,1,1


In [0]:
# Interaction feature: element-wise product of Age and Pclass.
train_df['Age*Class'] = train_df['Age'].mul(train_df['Pclass'])
test_df['Age*Class'] = test_df['Age'].mul(test_df['Pclass'])

In [0]:
# Show the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,Pclass,Age,Fare,female,male,C,Q,S,Is_Alone,Age*Class
0,3,34.5,7.8292,0,1,0,1,0,1,103.5
1,3,47.0,7.0,1,0,0,0,1,0,141.0
2,2,62.0,9.6875,0,1,0,1,0,1,124.0
3,3,27.0,8.6625,0,1,0,0,1,1,81.0
4,3,22.0,12.2875,1,0,0,0,1,0,66.0


In [0]:
# Write each cleaned frame to its CSV file in the working directory.
for frame, out_path in ((train_df, 'Clean_train.csv'), (test_df, 'Clean_test.csv')):
  frame.to_csv(out_path)


In [0]:
# Initialise a Git repository in the current working directory.
!git init 

Reinitialized existing Git repository in /content/.git/


In [0]:
# Show the branch and the state of the working tree.
!git status

On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31m.config/[m
	[31msample_data/[m

nothing added to commit but untracked files present (use "git add" to track)


In [0]:
!git branch -s

error: unknown switch `s'
usage: git branch [<options>] [-r | -a] [--merged | --no-merged]
   or: git branch [<options>] [-l] [-f] <branch-name> [<start-point>]
   or: git branch [<options>] [-r] (-d | -D) <branch-name>...
   or: git branch [<options>] (-m | -M) [<old-branch>] <new-branch>
   or: git branch [<options>] (-c | -C) [<old-branch>] <new-branch>
   or: git branch [<options>] [-r | -a] [--points-at]
   or: git branch [<options>] [-r | -a] [--format]

Generic options
    -v, --verbose         show hash and subject, give twice for upstream branch
    -q, --quiet           suppress informational messages
    -t, --track           set up tracking mode (see git-pull(1))
    -u, --set-upstream-to <upstream>
                          change the upstream info
    --unset-upstream      Unset the upstream info
    --color[=<when>]      use colored output
    -r, --remotes         act on remote-tracking branches
    --contains <commit>   print only branches that contain the commit
    -

In [0]:
# Stage every change in the working tree.
# NOTE(review): the recorded commit output lists .config/ and sample_data/
# files, so this stages everything in the directory; confirm that is intended.
!git add -A

In [0]:
# Commit the staged changes.
# NOTE(review): in the recorded run this attempt failed because no Git
# identity (user.email / user.name) had been configured yet.
!git commit -m "Data Wrangling"


*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@e072e959d413.(none)')


In [0]:
# Set the commit author email in the global Git configuration.
# NOTE(review): a personal address is hard-coded here and is stored with the notebook.
!git config --global user.email "hibagenius.vs.genius468@gmail.com"

In [0]:
# Set the commit author name in the global Git configuration.
!git config --global user.name "hiba007"

In [0]:
# Commit the staged changes (needs user.email and user.name to be configured).
!git commit -m "Data Wrangling"

[master (root-commit) 18da561] Data Wrangling
 16 files changed, 50663 insertions(+)
 create mode 100644 .config/.last_update_check.json
 create mode 100644 .config/active_config
 create mode 100644 .config/config_sentinel
 create mode 100644 .config/configurations/config_default
 create mode 100644 .config/gce
 create mode 100644 .config/logs/2019.02.15/17.21.27.847128.log
 create mode 100644 .config/logs/2019.02.15/17.21.31.764228.log
 create mode 100644 .config/logs/2019.02.15/17.21.39.299461.log
 create mode 100644 .config/logs/2019.02.15/17.21.41.846671.log
 create mode 100644 .config/logs/2019.02.15/17.21.42.616024.log
 create mode 100755 sample_data/README.md
 create mode 100755 sample_data/anscombe.json
 create mode 100644 sample_data/california_housing_test.csv
 create mode 100644 sample_data/california_housing_train.csv
 create mode 100644 sample_data/mnist_test.csv
 create mode 100644 sample_data/mnist_train_small.csv


In [0]:
# Check that nothing is left to commit.
!git status

On branch master
nothing to commit, working tree clean


In [0]:
!git remote add origin https://github.com/hiba007/Titanic-Analysis.git

fatal: remote origin already exists.


In [0]:
!sudo apt-get install xclip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  xclip
0 upgraded, 1 newly installed, 0 to remove and 8 not upgraded.
Need to get 17.5 kB of archives.
After this operation, 52.2 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 xclip amd64 0.12+svn84-4build1 [17.5 kB]
Fetched 17.5 kB in 0s (58.3 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package xclip.
(Reading database ... 131352 files and directories curren

In [0]:
!xclip -sel clip < ~/.ssh/id_rsa.pub

Error: Can't open display: (null)


In [0]:
# Generate a 4096-bit RSA key pair, using the email address as the key comment.
# NOTE(review): ssh-keygen prompts interactively; the recorded run stopped at
# the "Overwrite (y/n)?" prompt because /root/.ssh/id_rsa already existed.
!ssh-keygen -t rsa -b 4096 -C "hibagenius.vs.genius468@gmail.com"


Generating public/private rsa key pair.
Enter file in which to save the key (/root/.ssh/id_rsa): 
/root/.ssh/id_rsa already exists.
Overwrite (y/n)? 


In [0]:
!eval 'ssh-agent -s'

SSH_AUTH_SOCK=/tmp/ssh-SEqYipxBzuhN/agent.436; export SSH_AUTH_SOCK;
SSH_AGENT_PID=437; export SSH_AGENT_PID;
echo Agent pid 437;


In [0]:
# Add the default private key to the running ssh-agent.
# NOTE(review): the recorded run failed with "Could not open a connection to
# your authentication agent" - presumably SSH_AUTH_SOCK is not set in this
# command's shell; verify that the agent is started in the same invocation.
!ssh-add

Could not open a connection to your authentication agent.
