# Load Packages and Data

In [1]:
import pandas as pd
import numpy as np
import copy
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Read the raw iPhone spreadsheet; the sheet's first column is used as the row index.
df = pd.read_excel(
    io="/data/p_dsi/teams2023/team1/iphone_data.xlsx",
    index_col=0,
)

# Feature Engineering

- Add a "day" column (day of the month), derived from the existing "weeks_monday" column
- Set "phone size" and "weeks_since_release" columns to have integer values
- Drop unneeded columns

In [6]:
# Day of the month taken from each row's 'weeks_monday' date.
df['day'] = pd.to_datetime(df['weeks_monday']).dt.day

# Strip the literal 'gb' text from the size strings and store the remainder as integers.
# regex=False makes the literal match explicit regardless of the pandas version's default.
df['phone size'] = df['phone size'].str.replace('gb', '', regex=False).astype('int64')

# Cast to integer in one vectorised step instead of calling int() per element.
df['weeks_since_release'] = df['weeks_since_release'].astype('int64')

# Drop unneeded columns - identical over all rows, or correlated with other engineered columns.
# Rebinding (rather than inplace=True) keeps the step explicit; a missing column still raises KeyError.
df = df.drop(columns=['model', 'brand', 'release'])

# Drop Outliers

The last week for which we have data (the week of 2023-02-13) has much smaller values than expected. We assume this is because we do not have data for the whole week. Therefore, we drop it so that this outlier does not affect the models.

In [3]:
# Remove every row belonging to the week that is being treated as an outlier.
outlier_week = '2023-02-13'
keep_rows = df['weeks_monday'] != outlier_week
df = df.loc[keep_rows]

# Split Data

We use the last 8 weeks as test data, which at the moment is only about 3.7% of the data (46 of 1,251 rows). We keep the test window this short to make sure that iPhone 14 values are included in the training data, so the model is aware that that trend exists.

In [13]:
# Weeks (by their 'weeks_monday' value) that are held out as the test set.
test_time = [
    '2023-02-06', '2023-01-30', '2023-01-23', '2023-01-16',
    '2023-01-09', '2023-01-02', '2022-12-26', '2022-12-19',
]

# Build the membership mask once; the test set takes the matching rows, the train set the rest.
in_test_window = df['weeks_monday'].isin(test_time)
test_data = df.loc[in_test_window]
train_data = df.loc[~in_test_window]

In [19]:
# Confirm the split: list the weeks present in the test set and compare the set sizes.
n_train, n_test = len(train_data), len(test_data)
test_weeks = test_data['weeks_monday'].unique()

print("Dates in Test Data:", test_weeks)
print(f"Size of Train Data: {n_train} ; Size of Test Data: {n_test}")
# Note: this is the test size relative to the TRAIN size, not to the full dataset.
print("Test Data is ", n_test / n_train * 100, "% of Train Data")

Dates in Test Data: ['2022-12-19T00:00:00.000000000' '2022-12-26T00:00:00.000000000'
 '2023-01-02T00:00:00.000000000' '2023-01-09T00:00:00.000000000'
 '2023-01-16T00:00:00.000000000' '2023-01-23T00:00:00.000000000'
 '2023-01-30T00:00:00.000000000' '2023-02-06T00:00:00.000000000']
Size of Train Data: 1205 ; Size of Test Data: 46
Test Data is  3.8174273858921164 % of Train Data


# Export Datasets

In [21]:
# DataFrame.reset_index returns a new frame instead of modifying the caller, so the
# result must be assigned back; otherwise the original row labels are what get
# written to the spreadsheets.
train_data = train_data.reset_index(drop=True)
train_data.to_excel("/data/p_dsi/teams2023/team1/train_data.xlsx")

test_data = test_data.reset_index(drop=True)
test_data.to_excel("/data/p_dsi/teams2023/team1/test_data.xlsx")