In [None]:
# Importing a CSV without a header

import pandas as pd
url="LINK"  # placeholder: replace with the real path or URL of the CSV file
# header=None tells pandas the file has no header row
df=pd.read_csv(url, header=None)

# Previewing the data frame in Python

n=5  # number of rows to preview
df # the entire dataframe (not recommended for large datasets)
df.head(n) # the first n rows of the data frame
df.tail(n) # the bottom n rows of the data frame
df.head() # head() without an argument uses its default row count

# Adding headers
# Replace default headers (by df.columns=headers)

headers=["","","etc"]  # placeholder: supply exactly one name per column of df
df.columns=headers
df.head(5)

## Exporting to different formats in Python

Data Format | Read | Save  
:---: | :---: | :---: |  
csv | pd.read_csv() | df.to_csv()  
json | pd.read_json() | df.to_json()  
excel | pd.read_excel() | df.to_excel()  
sql | pd.read_sql() | df.to_sql()

In [None]:
# Check the data type of every column

df.dtypes

# Returns a statistical summary

df.describe()

# Provide full summary statistics (include="all" asks for every column, not only the default selection)

df.describe(include="all")

# Now, the outcome shows the summary, including: object typed, unique, top, & frequency
# Unique is the number of distinct objects in the column
# Top is the most frequently occurring object
# Freq is the number of times the top object appears in the column
# Some values in the table are shown here as NaN which stands for "Not a Number"

df.info()

# NOTE(review): the original note here read "shows the top 30 rows and bottom 30 rows
# of the data frame". df.info() prints a concise summary of the frame rather than
# its rows -- confirm what this note was meant to describe.

In [None]:
# drop missing values along the column "price"
# NOTE: the result is not assigned, so df itself is left unchanged by this line

df.dropna(subset=["price"], axis=0)

# You can select the columns of a data frame by indicating the name of each column
# (`dataframe` and the 'column N' names are placeholders, not objects defined in this notebook)

dataframe[['column 1','column 2','column 3']]

# you can apply the method ".describe()" to get the statistics of those columns as follows:

dataframe[['column 1','column 2','column 3']].describe()

In [None]:
#  Data preprocessing is often called data cleaning or data wrangling

import numpy as np  # np.nan is the missing-value marker used below

# How to drop missing values in Python
# General form (`dataframe` is a placeholder for any DataFrame):

dataframe.dropna()

# axis=0 drops the entire row
# axis=1 drops the entire column

df.dropna(subset=["price"], axis=0, inplace=True)
# Setting the argument inplace to True allows the modification to be done on the data set directly

# How to replace missing values in Python
# General form (`missing_value` and `new_value` are placeholders):

dataframe.replace(missing_value,new_value)

# replace it with an average

mean=df["normalized-losses"].mean()
# replace() returns a new Series; assign it back, otherwise the result is thrown away
df["normalized-losses"]=df["normalized-losses"].replace(np.nan,mean)

In [None]:
# Data Formatting in Python

# Convert the values with 235/mpg, then rename the column to reflect the new unit.
# The rename key must match the existing column name exactly ("city-mpg", with a hyphen).
df["city-mpg"]=235/df["city-mpg"]
df.rename(columns={"city-mpg":"city-L/100km"},inplace=True)

# Correcting data types

# To identify data types (dtypes is an attribute, so it is not called):

dataframe.dtypes

# To convert data types (general form; `dataframe` is a placeholder and astype
# needs the target dtype as its argument, see the example below):

dataframe.astype()

# Example: convert data type to integer in column "price"

df["price"]=df["price"].astype("int")

## Methods of normalizing data

###### "length" is the name of the variable/column to normalize
###### NOTE: the three methods below are alternatives. Each one overwrites df["length"],
###### so running them one after another normalizes already-normalized values.

###### simple feature scaling just divides each value by the maximum value for that feature. This makes the new values range between zero and one.

df["length"]=df["length"]/df["length"].max()

###### min-max subtracts the minimum value of the feature from each value X_old, then divides by the range of that feature. Again, the resulting new values range between zero and one.

df["length"]=(df["length"]-df["length"].min())/(df["length"].max()-df["length"].min())

###### z-score or standard score. In this formula for each value you subtract the mu which is the average of the feature, and then divide by the standard deviation sigma. The resulting values hover around zero, and typically range between negative three and positive three but can be higher or lower.

df["length"]=(df["length"]-df["length"].mean())/df["length"].std()

In [None]:
# Bins = range by groups

import numpy as np
import pandas as pd

# Four equally spaced edges between the lowest and highest price give three bins
bins=np.linspace(min(df["price"]),max(df["price"]),4)

group_names=["Low","Medium","High"]

# include_lowest=True makes the first bin include its left edge (the minimum price)
df["price-binned"]=pd.cut(df["price"],bins,labels=group_names,include_lowest=True)

# Dummy variables in Python pandas also called one-hot encoding

pd.get_dummies(df["fuel"])

<b>How to work with missing data?</b>

Steps for working with missing data:
<ol>
    <li>identify missing data</li>
    <li>deal with missing data</li>
    <li>correct data format</li>
</ol>

<h2 id="identify_handle_missing_values">Identify and handle missing values</h2>


<h3 id="identify_missing_values">Identify missing values</h3>
<h4>Convert "?" to NaN</h4>
In the car dataset, missing data comes with the question mark "?".
We replace "?" with NaN (Not a Number), which is Python's default missing value marker, for reasons of computational speed and convenience. Here we use the function: 
 <pre>.replace(A, B, inplace = True) </pre>
to replace A by B

In [None]:
import numpy as np

# Swap every "?" entry for NaN, modifying df in place
df.replace(to_replace="?", value=np.nan, inplace=True)
df.head(n=5)

In [None]:
# Boolean frame built from df: True marks a missing entry

missing_data = df.isna()
missing_data.head(n=5)

# Tally True/False (missing/present) entries, one column at a time

for col_name in missing_data.columns.tolist():
    col_counts = missing_data[col_name].value_counts()
    print(col_name, col_counts, "", sep="\n")

<h3 id="deal_missing_values">Deal with missing data</h3>
<b>How to deal with missing data?</b>

<ol>
    <li>drop data<br>
        a. drop the whole row<br>
        b. drop the whole column
    </li>
    <li>replace data<br>
        a. replace it by mean<br>
        b. replace it by frequency<br>
        c. replace it based on other functions
    </li>
</ol>

In [None]:
# Descriptive Statistics

# Summarize categorical data by using the value_counts() method

# value_counts() returns a Series, and Series.rename() has no `columns=` argument.
# Name the Series 'value_counts' and convert it to a one-column DataFrame instead.
drive_wheels_counts=df['drive-wheels'].value_counts().rename('value_counts').to_frame()
drive_wheels_counts.index.name='drive-wheels'

Box plots:
- The main features that the box plot shows are the median of the data, which represents where the middle data point is.
- The upper quartile shows where the 75th percentile is.
- The lower quartile shows where the 25th percentile is.
- The data between the upper and lower quartile represents the interquartile range.
- Next, you have the lower and upper extremes. These are calculated as 1.5 times the interquartile range above the 75th percentile and as 1.5 times the IQR below the 25th percentile.
- Finally, box plots also display outliers as individual dots that occur outside the upper and lower extremes.

1. The **predictor/independent** variable is the variable that you are using to predict an outcome.
2. The **target/dependent** variable is the variable that you are trying to predict.
3. In a scatter plot, we typically set the **predictor** variable on the **x-axis** or **horizontal axis**, and we set the **target** variable on the **y-axis** or **vertical axis**.

In [None]:
# Scatterplot example

import matplotlib.pyplot as plt  # plt is needed by the plotting calls below

# Predictor (engine size) on the x-axis, target (price) on the y-axis
x=df['engine-size']
y=df['price']
plt.scatter(x,y)

plt.title('Scatterplot of Engine Size vs Price')
plt.xlabel('Engine Size')
plt.ylabel('Price')

# GroupBy() example

# Selecting several columns needs a list inside the brackets (double brackets)
df_test=df[['drive-wheels','body-style','price']]
# Mean price for every (drive-wheels, body-style) pair; as_index=False keeps the keys as columns
df_grp=df_test.groupby(['drive-wheels','body-style'],as_index=False).mean()
df_grp

# Pivot Table

# One row per drive-wheels value, one column per body-style value
df_pivot=df_grp.pivot(index='drive-wheels',columns='body-style')

# Heatmap: plot target variable over multiple variables

# pcolor draws the grid of values; 'RdBu' is the red-blue colormap
plt.pcolor(df_pivot,cmap='RdBu')
plt.colorbar()
plt.show()

# Plot correlation

import seaborn as sns  # imported here because this is the first use of sns

# Column names must match the frame: 'engine-size' and 'price'
sns.regplot(x='engine-size',y='price',data=df)
plt.ylim(0,)

# Pearson correlation example

from scipy import stats

# pearsonr is called with the two columns and returns the coefficient and its p-value
pearson_coef,p_value=stats.pearsonr(df['horsepower'],df['price'])

# ANOVA between Honda & Subaru

# Keep only the two columns involved, then group the rows by manufacturer
df_anova=df.loc[:,['make','price']]
grouped_anova=df_anova.groupby(by=['make'])

# Values recorded for this comparison
F,pvalue=0.19,0.66

# ANOVA between Honda & Jaguar

# Values recorded for this comparison (these overwrite the pair above)
F,pvalue=400.92,1.05e-11

# A large F-test value together with a small p-value indicates a strong
# correlation between a categorical variable and other variables

# Fitting a Simple Linear Model Estimator

# X: Predictor variable
# y: Target variable

# Import linear_model from scikit-learn
from sklearn.linear_model import LinearRegression

# Create a Linear Regression Object using the constructor
lm=LinearRegression()

# Fitting a Simple Linear Model
# We define the predictor variable and the target variable
# (double brackets keep the predictor as a 2-D, single-column frame)
x=df[['highway-mpg']]
y=df['price']

# Then fit the model
lm.fit(x,y)

# We can obtain a prediction
yhat=lm.predict(x)

# Fitting a Multiple Linear Model Estimator

# We can extract the 4 predictor variables and store them in the variable z
z=df[['horsepower','curb-weight','engine-size','highway-mpg']]

# Then train the model as before
lm.fit(z,df['price'])

# Obtain the prediction (predict on the same four columns the model was fitted on)
yhat=lm.predict(z)

# Find the intercept (b0)
lm.intercept_

# Find the coefficients (b1,b2,b3,b4)
lm.coef_

# The Estimated Linear Model
#
# y = b0 + b1*x1 + b2*x2 + b3*x3 + b4*x4
#
# Price = -15678.74 + (52.66)*horsepower + (4.70)*curb-weight + (81.96)*engine-size + (33.58)*highway-mpg

# Regression Plot

import seaborn as sns

sns.regplot(x='highway-mpg',y='price',data=df)
plt.ylim(0,)

# Residual Plot

# x and y are passed by keyword so the call does not depend on positional order
sns.residplot(x=df['highway-mpg'],y=df['price'])

# Distribution Plots
# NOTE(review): distplot is reported as deprecated in recent seaborn releases
# (kdeplot is the suggested replacement) -- confirm against the installed version.

axl=sns.distplot(df['price'],hist=False,color='r',label="Actual Value")

# yhat holds the fitted values from lm.predict(); both curves share the axes axl
sns.distplot(yhat,hist=False,color="b",label="Fitted Values",ax=axl)

# Polynomial Regression

# Quadratic - 2nd order (Degree 2)
# y = b0 + b1*x1 + b2*(x1)^2

# Cubic - 3rd order (Degree 3)
# y = b0 + b1*x1 + b2*(x1)^2 + b3*(x1)^3

# Higher order (more Degrees)
# y = b0 + b1*x1 + b2*(x1)^2 + b3*(x1)^3 + ...

# Calculate Polynomial of 3rd order

# np.polyfit needs one-dimensional x and y, so the columns are passed as Series
f=np.polyfit(df['highway-mpg'],df['price'],3)
# poly1d wraps the fitted coefficients in a polynomial object that prints readably
p=np.poly1d(f)
print(p)

# Polynomial Regression with more than one dimension

from sklearn.preprocessing import PolynomialFeatures
# include_bias is a constructor option, not an argument of fit_transform
pr=PolynomialFeatures(degree=2,include_bias=False)
# Select both predictor columns from df with a list (double brackets)
x_polly=pr.fit_transform(df[['horsepower','curb-weight']])

# Pre-processing
# Normalize several features at the same time

from sklearn.preprocessing import StandardScaler

# fit() returns the scaler itself, so construction and fitting chain into one statement
SCALE=StandardScaler().fit(x_data[['horsepower','highway-mpg']])
x_scale=SCALE.transform(x_data[['horsepower','highway-mpg']])

# Pipelines

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Pipeline Constructor
# Each step is a (name, estimator) tuple; the last step is the model

Input=[('scale',StandardScaler()),('polynomial',PolynomialFeatures(degree=2)),('mode',LinearRegression())]
pipe=Pipeline(Input)

# We can train the pipeline object
# (a Pipeline is trained with fit(); the four predictors are selected with double brackets)

features=df[['horsepower','curb-weight','engine-size','highway-mpg']]
pipe.fit(features,df['price'])
yhat=pipe.predict(features)

# Measures for In-Sample Evaluation

from sklearn.metrics import mean_squared_error

# Fit the simple linear model first so both measures below describe the same fit
x=df[['highway-mpg']]
y=df['price']
lm.fit(x,y)

# Mean Square Error (MSE)

Y_predict_simple_fit=lm.predict(x)
mean_squared_error(df['price'],Y_predict_simple_fit)

# R-squared

lm.score(x,y)

import numpy as np

# we train the model
# (the predictor is selected with double brackets so it is 2-D; the target column is 'price')

lm.fit(df[['highway-mpg']],df['price'])

# Predict the price of a car with 30 highway-mpg
# predict() takes 2-D input as well: one row, one feature

lm.predict(np.array([[30]]))

# We use the numpy function arange to generate a sequence from 1 to 100

new_input=np.arange(1,101,1).reshape(-1,1)

# The first parameter is the starting point of the sequence = 1
# The second parameter is the endpoint plus one of the sequence = 101
# The final parameter is the step size between elements in the sequence = 1
# reshape(-1,1) turns the sequence into a single column

# We can predict new values

yhat=lm.predict(new_input)

In [None]:
# Model Evaluation and Refinement

# Randomly partition the data into a training subset and a testing subset

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=0,
)

# Where
# x_data: features or independent variables
# y_data: dataset target: df['price']
# x_train & y_train: parts of available data as training set
# x_test & y_test: parts of available data as testing set
# test_size: percentage of the data for testing (here 30%)
# random_state: fixed seed, so the same split is produced on every run

# Cross Validation

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# The model to cross-validate: a linear regression object, lr
lr=LinearRegression()

scores=cross_val_score(lr,x_data,y_data,cv=3)

np.mean(scores)

# The first input parameters:
# the type of model we are using to do the cross-validation: In this example, we initialize a linear regression model or lr
# x_data: the predictive variable data
# y_data: the target variable data
# cv: the number of partitions with the cv parameter. Here, cv=3, which means the data set is split into 3 equal partitions
# The function returns an array of scores, one for each partition that was chosen as the testing set
# We can average the result together to estimate out of sample r squared using the mean function in NumPy.

# Cross Validation Prediction
# it returns the prediction that was obtained for each element when it was in the test set

from sklearn.model_selection import cross_val_predict
yhat=cross_val_predict(lr,x_data,y_data,cv=3)

# Overfitting, Underfitting and Model Selection

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

lr=LinearRegression()

# Test-set R^2 for polynomial models of increasing order
Rsqu_test=[]
order=[1,2,3,4]
for n in order:
    pr=PolynomialFeatures(degree=n)
    # Fit the transformer on the training data, then reuse it unchanged on the test data
    x_train_pr=pr.fit_transform(x_train[['horsepower']])
    x_test_pr=pr.transform(x_test[['horsepower']])
    lr.fit(x_train_pr,y_train)
    Rsqu_test.append(lr.score(x_test_pr,y_test))

# Ridge Regression

from sklearn.linear_model import Ridge
# alpha is the regularization strength passed to the Ridge constructor
RidgeModel=Ridge(alpha=0.1)
RidgeModel.fit(x,y)
yhat=RidgeModel.predict(x)

*As alpha increases the parameters get smaller*\
This is most evident for the higher order polynomial features. But Alpha must be selected carefully.\
If alpha is too large, the coefficients will approach zero and **underfit** the data.\
If alpha is zero, the **overfitting** is evident.

Example:
- Alpha = 0.001, the overfitting begins to subside.
- Alpha = 0.01, the estimated function tracks the actual function.
- Alpha = 1, we see the first signs of **underfitting**
- Alpha = 10, we see extreme **underfitting**

*Conversely, as alpha increases, the R-squared on the training data decreases.
This is because the term Alpha prevents overfitting.
This may improve the results on unseen data, but the model has worse performance on the training data.*

## Grid search

The term Alpha in Ridge regression is called a hyperparameter.
Scikit-learn has a means of automatically iterating over these hyperparameters using cross-validation, called **Grid Search**.

*What data do we use to pick the best hyperparameter?*
Response: Validation Data

The parameter grid passed to Grid Search is a Python list that contains a Python dictionary.
The key is the name of the free parameter ('alpha').
The value of the dictionary is the different values of the free parameter (1,10,100,1000).

parameters=[{'alpha':[1,10,100,1000]}]

In [None]:
# Grid Search

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
# Candidate alpha values; each value is listed once so no candidate is fitted twice
parameters1=[{'alpha':[0.001,0.1,1,10,100,1000,10000,100000]}]
RR=Ridge()
# cv=4: every candidate is scored with 4-fold cross-validation
Grid1=GridSearchCV(RR,parameters1,cv=4)
Grid1.fit(x_data[['horsepower','curb-weight','engine-size','highway-mpg']],y_data)
Grid1.best_estimator_
scores=Grid1.cv_results_
scores['mean_test_score']

In [None]:
# Parameter grid that varies alpha and also toggles Ridge's `normalize` option
# NOTE(review): `normalize` is reported as removed from Ridge in newer scikit-learn
# releases -- confirm against the installed version.

parameters=[
    {
        'alpha':[1,10,100,1000],
        'normalize':[True,False],
    }
]
Ridge()

In [None]:
# Grid Search with normalized data

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
# NOTE(review): `normalize` is reported as removed from Ridge in newer scikit-learn
# releases -- confirm against the installed version before running this grid.
parameters2=[{'alpha':[1,10,100,1000],'normalize':[True,False]}]
RR=Ridge()
# return_train_score=True makes cv_results_ include 'mean_train_score' as well as 'mean_test_score'
Grid1=GridSearchCV(RR,parameters2,cv=4,return_train_score=True)
Grid1.fit(x_data[['horsepower','curb-weight','engine-size','highway-mpg']],y_data)
Grid1.best_estimator_
scores=Grid1.cv_results_

In [None]:
# We can print out the score for the different free parameter values
# (scores['mean_train_score'] is only present when the grid search was created
# with return_train_score=True)

for param,mean_test,mean_train in zip(scores['params'],scores['mean_test_score'],scores['mean_train_score']):
    print(param,'R^2 on test data:', mean_test,'R^2 on train data:', mean_train)