In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Exploratory Data Analysis

First, we plot *histograms* (via pandas' `DataFrame.hist`) to understand how the data is distributed.

In [None]:
# Read the aircraft dataset CSV from the Kaggle input directory into a DataFrame
DATASET_PATH = '/kaggle/input/investigating-aircraft-weight-and-balance-metrics/aircraft_dataset.csv'
aircraft_df = pd.read_csv(DATASET_PATH)

# Show the first rows as a quick sanity check of the loaded data
aircraft_df.head()

In [None]:
# Summary statistics of the dataset, as produced by DataFrame.describe()
aircraft_df.describe()

In [None]:
# Plot histograms of the feature columns for analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Columns left out of the feature set used for plotting and correlation
excluded_columns = ['Unnamed: 0', 'Aircraft Model', 'Center of Gravity Limits', 'Engine Type']
aircraft_features = aircraft_df.drop(columns=excluded_columns)

# One histogram panel per remaining column
aircraft_features.hist(figsize=(13, 12))

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation matrix of the selected feature columns
aircraft_features.corr()

In [None]:
# Render the feature correlation matrix as an annotated heatmap (4 decimal places)
corr_matrix = aircraft_features.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt=".4f")
heatmap_ax.set_title('Heatmap')

plt.show()

### Machine Learning
We start with a k-nearest neighbors (k-NN) classifier and analyse its precision.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the ones listed here; target: the aircraft model
non_feature_columns = ['Unnamed: 0', 'Aircraft Model', 'Center of Gravity Limits', 'Engine Type']
X = aircraft_df.drop(columns=non_feature_columns)
y = aircraft_df['Aircraft Model']

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# k-NN classifier voting over the 20 nearest training samples
n_neighbors = 20
clf = KNeighborsClassifier(n_neighbors=n_neighbors)

clf.fit(X_train, y_train)

In [None]:
# Predict the aircraft model of each test row with the fitted k-NN classifier
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Text report of the k-NN predictions against the true test labels
knn_report = classification_report(y_test, y_pred)
print(knn_report)

#### Using RandomForestClassifier with numeric features

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Random forest with trees capped at depth 20; fixed seed for reproducible results
rfc_params = {'max_depth': 20, 'random_state': 2}
clf_rfc = RandomForestClassifier(**rfc_params)

# Train on the same (unscaled) training split used for k-NN
clf_rfc.fit(X_train, y_train)

In [None]:
# Predict the aircraft model of each test row with the fitted random forest
y_predrfc = clf_rfc.predict(X_test)

In [None]:
# Text report of the random forest predictions against the true test labels
print(classification_report(y_test, y_predrfc))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels vs. predictions.
# NOTE(review): this cell is in the random forest section but passes `y_pred`
# (the k-NN predictions), not `y_predrfc` -- confirm which model was intended.
confusion_matrix(y_test, y_pred)

##### *As can be seen, the results were not satisfactory, so the data needs preprocessing before the models can be evaluated properly.*

In [None]:
from sklearn import preprocessing

# Build a StandardScaler and fit it on the training split only
sscaler = preprocessing.StandardScaler()
sscaler.fit(X_train, y_train)

# Display the fitted scaler to confirm it was created
sscaler

In [None]:
# Apply the fitted scaler to the training features (returns an array-like without column labels)
X_scaled = sscaler.transform(X_train)

# Wrap the scaled values in a DataFrame that reuses the training column names
X_scaled_df = pd.DataFrame(X_scaled, columns=X_train.columns)

# Display the scaled training features
X_scaled_df

In [None]:
# Re-create clf_rfc (replacing the earlier model), this time without the max_depth limit
clf_rfc = RandomForestClassifier(random_state=2)

# Train on the standardized training features
clf_rfc.fit(X_scaled_df, y_train)

In [None]:
# clf_rfc was fitted on standardized features, so the test features must go
# through the same train-fitted scaler before prediction; feeding raw X_test
# would evaluate the model on a different feature space.
X_test_scaled_df = pd.DataFrame(
    sscaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

y_pred2 = clf_rfc.predict(X_test_scaled_df)

# Confusion matrix of true test labels vs. predictions on the scaled test set
print(confusion_matrix(y_test, y_pred2))

In [None]:
# Text report of the y_pred2 predictions against the true test labels
print(classification_report(y_test, y_pred2))