In [1]:
# The idea of this script is to replicate the final CSV that we crafted during the exploratory analysis, in order to feed the BI.

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

def create_model(x, y):
    '''
    Fit a LinearRegression estimator on the given data and return it.

    x: feature values handed to ``fit`` as-is
    y: target values handed to ``fit`` as-is
    '''
    # ``fit`` returns the estimator itself, so the fitted object can be
    # returned directly without an intermediate name.
    return LinearRegression().fit(x, y)


def create_dataframe(df):
    '''
    Clean the survey dataframe and add the derived columns.

    The frame is modified IN PLACE and the same object is returned:
      * missing 'Arrival Delay in Minutes' values are imputed with a linear
        regression on 'Departure Delay in Minutes'
      * 'satisfaction_avg': mean of the 14 service-score columns, 2 decimals
      * 'total_delay over distance': (departure + arrival delay) / distance
      * 'age_range': age bucket label
      * the 'Unnamed: 0' column is removed when present

    df: pandas DataFrame holding the survey columns referenced below
    returns: the same DataFrame object, after modification
    '''
    departure_col = 'Departure Delay in Minutes'
    arrival_col = 'Arrival Delay in Minutes'

    # filling na values
    # Only fit a model when there is something to impute, so the function can
    # be re-run on an already cleaned frame.
    if df[arrival_col].isna().any():
        # Train only on rows where both delay columns are known; NaNs in
        # unrelated columns should not shrink the training set.
        train = df.dropna(subset=[departure_col, arrival_col])
        x = train[departure_col].values.reshape((-1, 1))
        y = train[arrival_col]
        lin = create_model(x, y)
        predicted = lin.coef_[0] * df[departure_col] + lin.intercept_
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the column selection: the chained in-place
        # form is deprecated and does not update df under Copy-on-Write.
        df[arrival_col] = df[arrival_col].fillna(predicted)

    # avg of satisfaction score
    score_cols = [
        'Inflight wifi service', 'Departure/Arrival time convenient',
        'Ease of Online booking', 'Gate location', 'Food and drink',
        'Online boarding', 'Seat comfort', 'Inflight entertainment',
        'On-board service', 'Leg room service', 'Baggage handling',
        'Checkin service', 'Inflight service', 'Cleanliness',
    ]
    # skipna=False keeps the semantics of adding the columns with '+':
    # a missing score in any column makes the row average NaN.
    score_total = df[score_cols].sum(axis=1, skipna=False)
    df['satisfaction_avg'] = (score_total / len(score_cols)).round(2)

    # make the total delay relative to flight distance
    df['total_delay over distance'] = (df[departure_col] + df[arrival_col]) / df['Flight Distance']

    # make bins of age
    # The first matching condition wins, so each test only needs the upper
    # bound. NOTE: a missing Age matches no condition and therefore gets the
    # default '>=50' label, exactly as the previous per-row lambda did.
    age = df['Age']
    df['age_range'] = np.select(
        [age < 20, age < 30, age < 40, age < 50],
        ['<20', '20-30', '30-40', '40-50'],
        default='>=50',
    )

    # errors='ignore' so a frame without the stray CSV index column (e.g. on
    # a second run) does not raise KeyError.
    df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')
    return df


# Load the raw survey answers from disk.
df = pd.read_csv(filepath_or_buffer='cuestionario_satisfaccion.csv')

# create_dataframe returns the very frame it was given, so ``df_clean`` and
# ``df`` refer to the same (now cleaned) object.
df_clean = create_dataframe(df=df)

# Export the cleaned survey.
df_clean.to_excel(excel_writer='clean_survey.xlsx')

In [13]:
def generate_correlation_data(df):
    '''generates a dataset with the correlation index
    between satisfaction and features

    A 0/1 column 'satisfaction_bool' (1 when ``satisfaction`` equals
    'satisfied', 0 otherwise) is added to ``df`` in place, then the
    correlation of every numeric/bool column with it is computed.

    df: pandas DataFrame with a 'satisfaction' column
    returns: single-column DataFrame named 'satisfaction_bool', indexed by
             the numeric/bool columns of ``df`` (including
             'satisfaction_bool' itself)
    '''
    # Vectorised comparison; anything that is not exactly 'satisfied'
    # (missing values included) maps to 0, as with the row-wise version.
    df['satisfaction_bool'] = (df['satisfaction'] == 'satisfied').astype(int)

    # Restrict to numeric/bool columns before correlating: since pandas 2.0
    # DataFrame.corr() no longer drops text columns silently and raises on them.
    numeric = df.select_dtypes(include=['number', 'bool'])
    correlation = numeric.corr()[['satisfaction_bool']]
    return correlation

# Keep each feature's correlation with satisfaction and drop the target's own
# self-correlation row. Dropping by label (not by position) stays correct even
# when 'satisfaction_bool' is not the last column, e.g. after a re-run.
corr_df = generate_correlation_data(df).drop(index='satisfaction_bool')
corr_df.to_excel('correlation.xlsx')