# Data Preparation

Today and over the following days, we will work through a set of exercises on a dataset of UFO sightings from around the world. At the end, we will build a machine learning model that predicts how long a UFO can be seen, based on the characteristics of the sighting.

In this file, we focus on data preparation. Try to replicate the steps from the tutorial, become familiar with the information and variables in the data, and prepare it for the next phase: feature engineering.

You can download the data from [**here**](https://drive.google.com/open?id=0B2gZvn36c5CmRTJpS3pkUllmX1U).

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the UFO sightings CSV; fields in the file are separated by semicolons.
# NOTE(review): the path is machine-specific - point it at your own download location.
df = pd.read_csv(
    filepath_or_buffer='/Users/jurajkapasny/Downloads/UFO_data.csv',
    sep=';',
)

## Exercise

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Size of the frame as a (rows, columns) tuple.
frame_shape = df.shape
print(frame_shape)

In [None]:
# Data type of every column.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Collect the names of all columns whose dtype compares equal to "float".
column_dtypes = df.dtypes
is_float_column = column_dtypes == "float"
numeric_columns = column_dtypes[is_float_column].index.tolist()

In [None]:
# Remove the 'Unnamed: 0' column in place; it is not needed for the analysis.
df.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Missing-value overview: absolute count and share of nulls per column,
# both ordered from the most to the least affected column.
null_mask = df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
# Show the 20 columns with the most missing values.
missing_data.head(20)

In [None]:
# Discard, in place, every column in which more than 80 % of the values are missing.
too_sparse = missing_data['Percent'] > 0.8
drop_columns = missing_data.loc[too_sparse].index.tolist()
df.drop(columns=drop_columns, inplace=True)

In [None]:
# Impute missing numeric values.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on df[...]: that form mutates an intermediate object,
# is deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
# fill nan with mean
df['pressure'] = df['pressure'].fillna(df['pressure'].mean())
# fill nan with median
df['apparentTemperature'] = df['apparentTemperature'].fillna(df['apparentTemperature'].median())

In [None]:
# fill nan with text (categorical variable)
# Assigned back to the column rather than using fillna(inplace=True) on df[...],
# which is deprecated in pandas 2.x and has no effect on df under Copy-on-Write.
df['summary'] = df['summary'].fillna('Not available')

In [None]:
# Descriptive statistics of the sighting duration.
duration = df['duration_seconds']
duration.describe()

In [None]:
# histogram of the sighting duration (all values, extremes included)
# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is its replacement. The bin count is fixed at 50 (the upper
# limit distplot used for its automatic binning) so that heavy-tailed data
# cannot trigger an excessive number of bins.
plt.figure(figsize=(16, 6))
sns.histplot(df['duration_seconds'], bins=50, stat="density", kde=True, kde_kws={"cut": 3})
plt.show()

In [None]:
# histogram (without extreme values)
# Only durations between the 10th and the 90th percentile (inclusive) are plotted.
duration = df['duration_seconds']
lower, upper = duration.quantile(0.1), duration.quantile(0.9)
plt.figure(figsize=(16, 6))
# histplot replaces the deprecated sns.distplot: density histogram plus KDE overlay.
sns.histplot(duration[duration.between(lower, upper)],
             bins=50, stat="density", kde=True, kde_kws={"cut": 3})
plt.show()

In [None]:
# scatter plot precipIntensity vs. duration_seconds
# Only sightings with a duration up to the 90th percentile are plotted.
fig, ax = plt.subplots(figsize=(16, 6))
duration = df['duration_seconds']
df_lim = df[duration.between(duration.quantile(0.0), duration.quantile(0.9))]
ax.scatter(df_lim['precipIntensity'], df_lim['duration_seconds'])
ax.set_xlabel('precipIntensity')
ax.set_ylabel('duration')
plt.show()

In [None]:
# scatter plot cloudCover vs. duration_seconds
# Only sightings with a duration up to the 90th percentile are plotted.
fig, ax = plt.subplots(figsize=(16, 6))
duration = df['duration_seconds']
df_lim = df[duration.between(duration.quantile(0.0), duration.quantile(0.9))]
ax.scatter(df_lim['cloudCover'], df_lim['duration_seconds'])
ax.set_xlabel('cloudCover')
ax.set_ylabel('duration')
plt.show()

In [None]:
# categorical variable analysis
# duration in seconds - country relationship
# Only durations between the 10th and the 90th percentile (inclusive) are plotted.
duration = df['duration_seconds']
df_mid = df[duration.between(duration.quantile(0.1), duration.quantile(0.9))]
fig, ax = plt.subplots(figsize=(14, 8))
# Draw on the axes created above; `fig` stays bound to the Figure instead of
# being overwritten with the Axes that sns.boxplot returns.
sns.boxplot(x='country', y="duration_seconds", data=df_mid, ax=ax)
plt.show()

In [None]:
# categorical variable analysis
# duration in seconds - state relationship
# Only durations between the 10th and the 90th percentile (inclusive) are plotted.
duration = df['duration_seconds']
df_mid = df[duration.between(duration.quantile(0.1), duration.quantile(0.9))]
fig, ax = plt.subplots(figsize=(14, 8))
# Draw on the axes created above; `fig` stays bound to the Figure instead of
# being overwritten with the Axes that sns.boxplot returns.
sns.boxplot(x='state', y="duration_seconds", data=df_mid, ax=ax)
plt.show()

In [None]:
# categorical variable analysis
# duration in seconds - shape
# Only durations between the 10th and the 90th percentile (inclusive) are plotted.
duration = df['duration_seconds']
df_mid = df[duration.between(duration.quantile(0.1), duration.quantile(0.9))]
fig, ax = plt.subplots(figsize=(18, 8))
# Draw on the axes created above; `fig` stays bound to the Figure instead of
# being overwritten with the Axes that sns.boxplot returns.
sns.boxplot(x='shape', y="duration_seconds", data=df_mid, ax=ax)
plt.show()

In [None]:
# correlation between numeric variables
# Restrict the frame to numeric/boolean columns first: since pandas 2.0
# DataFrame.corr() no longer skips non-numeric columns and raises on text
# columns such as 'summary'.
corrmat = df.select_dtypes(include=["number", "bool"]).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Colour scale is capped at 0.8; the heatmap is drawn on the axes created above.
sns.heatmap(corrmat, vmax=.8, square=True, cmap="RdYlGn_r", ax=ax)
plt.show()

In [None]:
# correlation with duration_seconds
# Only numeric/boolean columns are correlated: since pandas 2.0
# DataFrame.corr() raises when the frame still contains text columns.
corrmat = df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# The five variables with the lowest correlation with duration_seconds
# (ascending sort, so the most negative values come first).
duration_corr = corrmat[["duration_seconds"]]
duration_corr.sort_values(by="duration_seconds").head()

In [None]:
# Variable transformation

In [None]:
# Plot the distribution of each weather variable, one figure per column.
cols = ["dewPoint", 'humidity', "temperature", "windSpeed"]
for cl in cols:
    # histogram of the full column (extreme values are not filtered here)
    plt.figure(figsize=(16, 6))
    # histplot replaces the deprecated sns.distplot: density histogram plus KDE overlay.
    sns.histplot(df[cl], bins=50, stat="density", kde=True, kde_kws={"cut": 3})
    plt.show()

In [None]:
# Transformed copies of the weather variables.
# log1p = log(1 + x) stays finite for a wind speed of 0, where log(x) would give -inf.
df["windSpeed_log"] = np.log1p(df["windSpeed"])
df["humidity_2"] = df["humidity"] * df["humidity"]
# we use square transformation to get rid of the left skewness in the variable.
# we need to shift the distribution to only positive numbers before: the minimum
# is subtracted (not added) so that the smallest value maps to 1 and every
# shifted value is strictly positive.
# NOTE(review): the log applied after squaring equals 2 * log(shifted value) -
# confirm that the log is intended on top of the square.
temperature_shifted = df["temperature"] - df["temperature"].min() + 1
df["temperature_2"] = np.log(temperature_shifted ** 2)
dew_point_shifted = df["dewPoint"] - df["dewPoint"].min() + 1
df["dewPoint_2"] = np.log(dew_point_shifted ** 2)

# Feature Engineering

Continue where you left off in the data preparation exercise. Focus on the different feature engineering techniques we read about today and try to come up with some features that could be relevant later on.

In [None]:
# Calendar features derived from the sighting timestamp.
df["timestamp"] = pd.to_datetime(df["timestamp"])
sighting_time = df["timestamp"].dt
df["hour"] = sighting_time.hour
df["day_of_the_week"] = sighting_time.dayofweek
df["month"] = sighting_time.month

# Part of the day: every row starts as "night" and is relabelled when its hour
# falls into one of the inclusive ranges below.
hour_ranges = {"morning": (6, 11), "afternoon": (12, 17), "evening": (18, 21)}
df["time_of_the_day"] = "night"
for label, (first_hour, last_hour) in hour_ranges.items():
    df.loc[df["hour"].between(first_hour, last_hour), "time_of_the_day"] = label

# Season: every row starts as "winter" and is relabelled when its month falls
# into one of the inclusive ranges below.
month_ranges = {"spring": (3, 5), "summer": (6, 8), "autumn": (9, 11)}
df["season"] = "winter"
for label, (first_month, last_month) in month_ranges.items():
    df.loc[df["month"].between(first_month, last_month), "season"] = label

In [None]:
# polynomial expansion, one of the techniques mentioned during the day
from sklearn.preprocessing import PolynomialFeatures

# Second-order polynomial terms, built from two features only.
poly_input = df[["apparentTemperature", "pressure"]]
poly = PolynomialFeatures(degree=2)
poly_data = poly.fit_transform(poly_input)

In [None]:
# Dimensions of the generated polynomial feature matrix.
poly_shape = poly_data.shape
print(poly_shape)

In [None]:
# we generated 6 features, now back to pandas:
# Reusing df's index keeps every polynomial row aligned with its source row,
# so an index-based merge with df matches correctly even when df's index is
# not the default 0..n-1 range.
df_poly_data = pd.DataFrame(
    poly_data,
    index=df.index,
    columns=["poly_feat_" + str(i) for i in range(poly_data.shape[1])],
)

In [None]:
# Attach the polynomial features to the main frame, matching rows on the index (inner join).
df = pd.merge(
    left=df,
    right=df_poly_data,
    how="inner",
    left_index=True,
    right_index=True,
)

In [None]:
# One-hot encode the categorical columns and show the size of the result.
categorical_columns = ["country", "shape"]
df_dummy = pd.get_dummies(df[categorical_columns])
df_dummy.shape