In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
import seaborn as sns

# Load the itinerary data; the path is relative to the notebook's working directory.
df = pd.read_csv('airline_10.csv')

# --- Exploratory data analysis ---
df.head()      # first rows: column names and what the data looks like
df.columns     # list of column names
df.tail()      # last rows
df.describe()  # summary statistics
df.shape       # number of rows and columns (12309 rows, 15 columns when this was written)
df.dtypes      # data type of each column

# Check each column for missing data.
df.isnull().any()
# BulkFare has missing data, but it is dropped below. All other columns show False, so there is no other missing data.

# Correlation matrix to start looking at relationships between columns.
# Restricted to numeric columns explicitly, because current pandas raises on
# string columns instead of silently skipping them.
df.select_dtypes(include=[np.number]).corr()

# Drop columns that will not be looked at.
# Reassigning instead of using inplace=True keeps the lineage of df explicit.
df = df.drop(['Unnamed: 0', 'ItinID'], axis=1)

# Mean fare per origin.
fares = pd.pivot_table(df, index='Origin', values='ItinFare', aggfunc='mean')
fares

# Mean fare broken down by origin, carrier, fare per mile and distance.
fares2 = pd.pivot_table(
    df,
    index=['Origin', 'RPCarrier', 'FarePerMile', 'Distance'],
    values='ItinFare',
    aggfunc='mean',
)
fares2

# Summary statistics for the existing fare data.
df['ItinFare'].describe()

# Other columns to drop. 'Unnamed: 0' and 'ItinID' were already removed above;
# listing them a second time made drop() raise a KeyError.
df = df.drop(['OriginState', 'FarePerMile', 'BulkFare'], axis=1)

df.head()

# Target: ItinFare, since the goal is to predict the fares of flights.
y = df['ItinFare']
y.head()

# df2 is the feature matrix (synonymous with x): every remaining column except the target.
# drop(..., inplace=True) returns None, so the previous assignment left df2 as None;
# a non-inplace drop returns the new frame and leaves df untouched.
df2 = df.drop(['ItinFare'], axis=1)

# The pivot-table output shows Origin and RPCarrier holding string codes, and
# LinearRegression needs numeric input, so one-hot encode any non-numeric columns.
# get_dummies leaves numeric columns unchanged, so this is a no-op if every
# feature is already numeric.
df2 = pd.get_dummies(df2)

df2.head()

df2.shape

df2.count()

# Train/test split on df2 (the features) and y (the target). A test size of 0.4
# sets 40% of the rows aside for testing later. The fixed seed makes the split,
# and therefore every score below, reproducible across runs.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    df2, y, test_size=0.4, random_state=RANDOM_STATE
)
# Single-argument print(...) produces the same text under Python 2 and Python 3.
print("{} {}".format(x_train.shape, y_train.shape))
print("{} {}".format(x_test.shape, y_test.shape))

x_train.head()

# Instantiate the model. LinearRegression is imported directly at the top of the
# notebook; the bare name `linear_model` was never imported and raised a NameError.
lm = LinearRegression()
lm

# Fit on the training data. fit() returns the estimator itself, so `model` and
# `lm` refer to the same fitted object.
model = lm.fit(x_train, y_train)

# Coefficient per feature: direction and magnitude of each feature's influence on y.
model.coef_

# Intercept: the predicted y when every feature is 0.
model.intercept_

# Score on the training data, to see how well the fitted line follows it.
lm.score(x_train, y_train)

# Predicted fares for the held-out test rows.
predictions = lm.predict(x_test)

predictions[0:5]

plt.scatter(y_test, predictions)
plt.xlabel('Actual Values')
plt.ylabel('Predictions')

# Score on the test data; compare with the training score above to check for overfitting.
print("Score: {}".format(model.score(x_test, y_test)))

# sklearn.cross_validation was removed in scikit-learn 0.20; the same helpers
# live in sklearn.model_selection.
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

# Perform 3-fold cross-validation over all of the data.
# cross_val_score returns one score per fold, not an average, so the mean is
# reported separately.
scores = cross_val_score(lm, df2, y, cv=3)
print("Cross-Validated scores: {}".format(scores))
print("Mean cross-validated score: {}".format(scores.mean()))

# The scores seen when this was written were not particularly great (hovering
# around 0.14). Next steps: try different feature combinations (what should be
# omitted) and tune hyperparameters, e.g. the alpha of the Ridge model imported
# above (needs further review).

















Origin  RPCarrier  FarePerMile  Distance
MDW     9E         0.0798       2694.0      213.0
                   0.1967       717.0       141.0
                   0.2180       2170.0      473.0
                   0.2378       698.0       166.0
                   0.2631       1414.0      372.0
                   0.2751       1414.0      389.0
                   0.3575       456.0       163.0
                   0.4534       1546.0      701.0
                   0.5263       456.0       240.0
                   0.5658       456.0       258.0
                   0.6819       349.0       238.0
                   0.7221       349.0       252.0
        CP         0.0657       1659.0      109.0
                   0.0764       1884.0      144.0
                   0.0818       2654.0      217.0
                   0.0876       3768.0      330.0
                   0.0987       2523.0      249.0
                   0.1036       4285.0      444.0
                   0.1205       5736.0      691.0
         