# Model the price of spaceflights using linear regression and report the R² score on the test set

In [1]:
import pickle

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

## Load data

In [2]:
# Relative directory the master table is read from.
data_input = "data/intermediate/"

# data_input already ends with "/", so plain concatenation yields the CSV path.
master_table_path = data_input + "master_table.csv"
master_table = pd.read_csv(master_table_path)

## Parameters

In [3]:
# Columns of the master table used as model inputs.
features = [
    "engines",
    "passenger_capacity",
    "crew",
    "d_check_complete",
    "moon_clearance_complete",
    "iata_approved",
    "company_rating",
    "review_scores_rating",
]

# Passed to train_test_split as test_size (0.2 holds out 20% of the rows).
test_size = 0.2

# Passed to train_test_split as random_state so the split is repeatable.
random_state = 3

## Main functions to run

In [5]:
def split_data(data, features, test_size, random_state, target="price"):
    """Split ``data`` into train and test sets of features and target.

    Args:
        data: Table that can be indexed both by a list of column names and
            by a single column name (a pandas DataFrame in this notebook).
        features: List of column names used as model inputs.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
        target: Name of the column to predict. Defaults to ``"price"``, the
            column this function previously hard-coded, so existing calls
            behave exactly as before.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X = data[features]
    y = data[target]
    # Unpack explicitly so the function keeps returning a tuple regardless of
    # the container type train_test_split hands back.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train):
    """Fit a scikit-learn ``LinearRegression`` with default settings.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    # fit() returns the estimator itself, so the fitted model can be
    # returned straight from the call.
    return LinearRegression().fit(X_train, y_train)

def evaluate_model(regressor, X_test, y_test):
    """Print the R^2 score of ``regressor`` on the given test data.

    Args:
        regressor: Fitted model exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values compared against the predictions.

    Returns:
        None. The score is only printed, formatted to three significant
        digits.
    """
    score = r2_score(y_test, regressor.predict(X_test))
    print(f"Model has a coefficient R^2 of {score:.3} on test data.")

## Run

In [8]:
# Hold out a test split, fit on the training portion, then print the test R^2.
X_train, X_test, y_train, y_test = split_data(
    data=master_table,
    features=features,
    test_size=test_size,
    random_state=random_state,
)
regressor = train_model(X_train=X_train, y_train=y_train)
evaluate_model(regressor=regressor, X_test=X_test, y_test=y_test)

Model has a coefficient R^2 of 0.462 on test data.


## Save model

In [9]:
# Persist the fitted model to "regressor.p" in the working directory.
# The context manager closes the file even if pickle.dump raises, which a
# manual open()/close() pair does not guarantee.
with open("regressor.p", "wb") as model_file:
    pickle.dump(regressor, model_file)