In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Spaceship Titanic train and test CSVs from the input directory.
train=pd.read_csv('./../input/spaceship-titanic/train.csv')
test=pd.read_csv('./../input/spaceship-titanic/test.csv')
# Keep an unmodified copy of the test frame: `test` itself is transformed in later
# cells, and the copy's PassengerId column is read again when the submission is built.
test1=test.copy()
train

In [None]:
# Preview the first rows of the untouched test copy.
test1.head()

## **Finding the Info about Data**

In [None]:
# Summarise column dtypes and non-null counts of the training frame.
train.info()

## **Finding the Unique Values in each Column** 

In [None]:
# Number of distinct values per column (missing values are not counted by default).
train.nunique()

## **Finding Null Values in each Column**

In [None]:
# Number of missing values in every column.
train.isnull().sum()

## **Drop some useless columns**

In [None]:
# Remove the passenger identifier and name columns from both frames.
train = train.drop(columns=['PassengerId', 'Name'])
test = test.drop(columns=['PassengerId', 'Name'])
train

## **Unique Values in a particular column**

In [None]:
# Distinct values that occur in the HomePlanet column.
train['HomePlanet'].unique()

In [None]:
# Summary statistics for every column except the last, ordered by standard
# deviation (largest first) and colour-coded for quick scanning.
(
    train.iloc[:, :-1]
    .describe()
    .T
    .sort_values(by='std', ascending=False)
    .style
    .background_gradient(cmap='GnBu')
    .bar(subset=["max"], color='#BB0000')
    .bar(subset=["mean",], color='green')
)

## **Heatmap**

In [None]:
import seaborn as sns

# Correlation is only defined for numeric data. The categorical columns have not
# been encoded yet at this point, and pandas >= 2.0 raises on non-numeric columns
# instead of silently skipping them, so restrict the input to numeric and boolean
# columns explicitly (this also works on older pandas versions).
sns.heatmap(
    train.select_dtypes(include=['number', 'bool']).corr(),
    cmap='rainbow',
    annot=True,
)

## **Countplot**

In [None]:
# Passenger counts per HomePlanet value, split by the Transported flag.
sns.countplot(x='HomePlanet',hue="Transported",data=train,palette = "Dark2")

## **Separate Features w.r.t. Datatypes**

In [None]:
# Split the training columns by dtype: numeric columns vs. everything else.
# Both results are new frames, so they keep the raw values even after `train`
# is modified in later cells.
num_train = train.select_dtypes(include='number')
cat_train = train.select_dtypes(exclude='number')
cat_train

In [None]:
# Same dtype split for the test frame: numeric columns vs. everything else.
num_test = test.select_dtypes(include='number')
cat_test = test.select_dtypes(exclude='number')
num_test

## **Encoding**

In [None]:
# Label-encode every non-numeric training column, replacing its values with integer codes.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in cat_train.columns:
    # The encoder is refit for each column, so after the loop `le` only holds
    # the classes of the last column processed.
    train[col] = le.fit_transform(train[col])
train

In [None]:
# Encode the test categoricals with the label -> code mapping learned from the
# TRAINING data. Refitting the encoder on the test set assigns codes from the
# test set's own sorted labels, so the same label can get a different code than
# it had in training (certain for high-cardinality columns whose values differ
# between the two files) and the model would misread the test features.
for col in cat_test:
    # cat_train still holds the raw training values, so fitting on it reproduces
    # exactly the classes that were used to encode train[col].
    train_labels = pd.Index(le.fit(cat_train[col]).classes_, dtype=object)
    # get_indexer returns each value's position among the training classes and
    # -1 for labels that never occurred in the training data.
    test[col] = train_labels.get_indexer(test[col])
test

## **Finding the following Features (to be treated separately later)**
* Continuous Features
* Discrete Features


In [None]:
# Split the numeric training columns by cardinality: more than 25 distinct values
# counts as continuous, 25 or fewer as discrete. Using `<= 25` for the discrete
# side closes the gap left by a `> 25` / `< 25` pair, which would put a column
# with exactly 25 distinct values in neither list.
con_train = [col for col in num_train if train[col].nunique() > 25]
dis_train = [col for col in num_train if train[col].nunique() <= 25]
con_train

In [None]:
# Split the numeric test columns by cardinality: more than 25 distinct values
# counts as continuous, 25 or fewer as discrete. Using `<= 25` for the discrete
# side closes the gap left by a `> 25` / `< 25` pair, which would put a column
# with exactly 25 distinct values in neither list.
con_test = [col for col in num_test if test[col].nunique() > 25]
dis_test = [col for col in num_test if test[col].nunique() <= 25]
con_test

## **Imputing**

Missing values are one of the most common problems you can encounter when you try to prepare your data for machine learning. The reason for the missing values might be human error, interruptions in the data flow, privacy concerns, and so on. Whatever the reason, missing values affect the performance of machine learning models.

In [None]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='mean')  # replaces missing values with the column mean
# csi = SimpleImputer(strategy='most_frequent')  # alternative: replace missing values with the most frequent value

# fit_transform returns a bare NumPy array. Rebuild the frame with the original
# column labels and index; otherwise the columns are renumbered 0..n-1 and any
# later access by column name fails with a KeyError.
train = pd.DataFrame(imp.fit_transform(train), columns=train.columns, index=train.index)

train

In [None]:
# fit_transform returns a bare NumPy array. Rebuild the frame with the original
# column labels and index; otherwise the columns are renumbered 0..n-1 and any
# later access by column name fails with a KeyError.
# NOTE(review): the imputer is refit here, so the fill values are the test-set
# means rather than the training means - confirm this is intended.
test = pd.DataFrame(imp.fit_transform(test), columns=test.columns, index=test.index)
test

## **Standardizing - Discrete Values**

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance).


In [None]:
# Scale the discrete training columns with RobustScaler (median / IQR based).
from sklearn.preprocessing import RobustScaler
rs = RobustScaler()

for col in dis_train:
    # scikit-learn transformers require 2-D input, so a plain Series raises a
    # ValueError. Select the column as a one-column frame and flatten the
    # (n_samples, 1) result back to 1-D before assigning it.
    train[col] = rs.fit_transform(train[[col]]).ravel()
train

In [None]:
# Scale the discrete test columns.
# NOTE(review): the scaler is refit on each test column, so the test data is
# scaled with its own statistics rather than the training ones - confirm intent.
for col in dis_test:
    # scikit-learn transformers require 2-D input, so a plain Series raises a
    # ValueError. Select the column as a one-column frame and flatten the
    # (n_samples, 1) result back to 1-D before assigning it.
    test[col] = rs.fit_transform(test[[col]]).ravel()
test

## **Removing Outliers**

In statistics, an outlier is a data point that differs significantly from other observations. An outlier may be due to variability in the measurement or it may indicate experimental error; the latter are sometimes excluded from the data set. An outlier can cause serious problems in statistical analyses.

In [None]:
# One box per column of the training frame, drawn as filled vertical boxes.
plt.boxplot(train,vert=True,patch_artist=True)

In [None]:
# Using Isolation Forest
from sklearn.ensemble import IsolationForest

# contamination=0.3 makes the detector flag roughly 30% of the rows it is fitted on.
# A fixed seed keeps the set of flagged rows identical from run to run.
iso = IsolationForest(contamination=0.3, random_state=42)

out = iso.fit_predict(train)  # -1 marks an outlier, 1 an inlier

# Keep only the inliers. The filtered frame has to be assigned back: evaluating
# `train[out != -1]` on its own builds the result and throws it away, leaving
# every outlier in the training data.
train = train[out != -1].reset_index(drop=True)
train

In [None]:
# Refit the detector on the test frame and flag its rows (-1 = outlier, 1 = inlier).
out = iso.fit_predict(test)

# NOTE(review): the filtered frame below is evaluated but never assigned, so `test`
# keeps every row. The submission cell pairs the predictions made on `test` with
# test1.PassengerId, so the row count has to stay unchanged - do not assign this back.
test[out != -1]
test

## **Splitting x & y**

In [None]:
# The last column is the target; every other column is a feature.
X = train.iloc[:, :-1]
y = train.iloc[:, -1]
X

## **Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation. The fixed random_state makes the
# split, and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train

## **Train**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Trees are capped at depth 20. oob_score=True additionally computes an
# out-of-bag accuracy estimate, available as model.oob_score_ after fitting.
# The fixed random_state makes the fitted forest reproducible across runs.
model = RandomForestClassifier(max_depth=20, oob_score=True, random_state=42)

model.fit(X_train, y_train)

## **Predict**

In [None]:
# Predict the class of every row in the hold-out split.
predict= model.predict(X_test)
predict

## **Score**

In [None]:
# Confusion matrix of hold-out labels vs. predictions, with rows/columns ordered
# by the classes the model was fitted on.
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
cm = confusion_matrix(y_test, predict, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model.classes_)
disp.plot()

In [None]:
# F1 score of the hold-out predictions (default binary averaging).
from sklearn.metrics import f1_score
f1_score(y_test, predict)

In [None]:
# Convert the predicted class codes to booleans: 0.0 -> False, anything else -> True.
sub = model.predict(test) != 0.0

# Pair each prediction with the passenger id from the untouched test copy and
# write the submission file.
submission = pd.DataFrame({
    'PassengerId': test1.PassengerId,
    'Transported': sub,
})
submission.to_csv('submission.csv', index=False)

## **Suggestions:-**
* Kaggle - https://www.kaggle.com/pythonkumar
* GitHub - https://github.com/KumarPython
* Twitter - https://twitter.com/KumarPython
* LinkedIn - https://www.linkedin.com/in/kumarpython/
