In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
from sklearn.model_selection import GroupKFold,TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_absolute_percentage_error

In [None]:
# Show up to 50 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 50)

In [None]:
# Input files, resolved relative to the notebook's working directory.
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test_set.csv'

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [None]:
# Parse the kick-off timestamps in both frames so they can later be sorted on
# and used as a datetime index.
for df in (train, test):
    df['kickoff_time'] = df['kickoff_time'].pipe(pd.to_datetime)
    

In [None]:
# Count the distinct values in the 'name' column of the training frame.
train['name'].nunique()

In [None]:
# Columns that apply_feature_engineering lags by one row, so a row holds the
# value from the previous row instead of its own.
to_shift_features = (
    'goals_scored',
    'goals_conceded',
    'assists',
    'bonus',
    'saves',
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'clean_sheets',
    'minutes',
    'yellow_cards',
)

# Columns that are not lagged. Kept as a tuple so it can be concatenated with
# to_shift_features.
normal_features = (
    'kickoff_time',
    'opponent_team',
    'opp_team_name',
    'round',
    'value',
    'was_home',
    'name',
    'position',
)

In [None]:
# List the column labels of the training frame.
train.columns

In [None]:
# Lagged columns first, then the non-lagged ones, as one flat tuple.
all_feats = (*to_shift_features, *normal_features)

In [None]:
# Display the training frame as loaded, with kickoff_time parsed to datetimes.
train

In [None]:

def apply_feature_engineering(df, shift_features=None, group_col='name', time_col='kickoff_time'):
    """Lag per-match statistics by one row within each player.

    Every column in ``shift_features`` is replaced by the value that the same
    ``group_col`` recorded in its previous row, where "previous" is decided by
    ``time_col`` order. A row therefore only carries statistics from before
    its own kickoff. The first row of each group has no predecessor and
    becomes NaN, so integer columns come back as floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_col``, ``time_col`` and every column named in
        ``shift_features``. It is not modified.
    shift_features : iterable of str, optional
        Columns to lag. Defaults to the module-level ``to_shift_features``.
    group_col : str, default 'name'
        Column identifying the entity whose history is lagged.
    time_col : str, default 'kickoff_time'
        Column defining chronological order.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original index labels and columns. Rows are in
        ascending ``time_col`` order; ties keep their input order.
    """
    if shift_features is None:
        shift_features = to_shift_features
    shift_features = list(shift_features)

    # sort_values returns a new frame, so the caller's frame is left untouched.
    # A stable sort keeps the input order of rows that share a timestamp.
    ordered = df.sort_values(time_col, kind='stable')

    if shift_features:
        # groupby().shift() lags inside each group while keeping the frame's
        # row order, so one chronological sort is enough for every player.
        ordered[shift_features] = (
            ordered.groupby(group_col, sort=False)[shift_features].shift(1)
        )

    return ordered

# Lag the match statistics in both frames.
# NOTE: the results are bound back to the same names, so running this cell a
# second time lags the already-lagged columns again. Reload the CSVs first.
train, test = map(apply_feature_engineering, (train, test))


In [None]:
# Keep a handle on the 'total_points' column. The later cells shown here read
# train['total_points'] directly rather than this variable.
target = train['total_points']

In [None]:
# Display the training frame after apply_feature_engineering has lagged the
# to_shift_features columns.
train

In [None]:
# One shared encoder instance. The encoding loop refits it for every
# categorical column, so afterwards it only holds the last column's mapping.
le = LabelEncoder()

In [None]:
# Columns that get integer-encoded with the LabelEncoder.
categorical = ['position','was_home']

In [None]:
# Integer-encode each categorical column. The mapping is learned from the
# training frame only and then applied to both frames, so a label that occurs
# only in test makes transform raise.
for category in categorical:
    le.fit(train[category])
    for df in (train, test):
        df[category] = df[category].pipe(le.transform)

In [None]:
# Display the training frame after the categorical columns were integer-encoded.
train

In [None]:
# Show the 'Unnamed: 0' column. It is dropped from both frames a few cells below.
train['Unnamed: 0']

In [None]:
# Save the player names before 'name' is dropped from test below.
# NOTE: this Series keeps test's current row labels, not the kickoff_time
# index that test is given later, so it must be re-attached by position.
name = test['name']

In [None]:
# Columns removed from both frames before modelling. 'value' also appears in
# normal_features but is dropped here all the same.
columns_to_drop = [
    'Unnamed: 0', 'Unnamed: 0.1', 'name', 'season_x', 'opp_team_name',
    'team_x', 'ict_index', 'influence', 'element', 'threat',
    'transfers_in', 'transfers_out', 'value', 'creativity',
    'team_a_score', 'team_h_score', 'transfers_balance', 'opponent_team',
]

# Both steps mutate the frames in place, so this cell can only be run once per
# load: a second run fails because 'kickoff_time' is no longer a column.
for df in (train, test):
    df.set_index('kickoff_time', inplace=True)
    df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Display the training frame, now indexed by kickoff_time and with the
# unused columns removed.
train

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# LightGBM regressor with library-default hyper-parameters; only random_state
# is set, to 42.
model = LGBMRegressor(random_state=42)

In [None]:
# Show the 'total_points' column, which the following cells use as the
# regression target.
train['total_points']

In [None]:
# Expanding-window cross-validation: each fold trains on an initial stretch of
# rows and validates on the stretch that follows.
cv = TimeSeriesSplit(n_splits=10)

# TimeSeriesSplit cuts folds purely by row position, so the rows have to be in
# chronological order for "train on the past, validate on the future" to hold.
# The index is kickoff_time at this point. A stable sort keeps the relative
# order of rows that share a kickoff.
train_by_time = train.sort_index(kind='stable')

metric = []  # one mean squared error (squared points, not RMSE) per fold
for train_v, test_v in cv.split(train_by_time):
    train_set = train_by_time.iloc[train_v]
    validation = train_by_time.iloc[test_v]
    model.fit(train_set.drop(columns='total_points'), train_set['total_points'])
    predictions = model.predict(validation.drop(columns='total_points'))
    metric.append(mean_squared_error(validation['total_points'], predictions))


In [None]:
# Per-fold mean squared errors collected by the cross-validation loop.
metric

In [None]:
import numpy as np
# Average of the per-fold mean squared errors.
np.mean(metric)

In [None]:
# Spread of the target in the training frame.
# NOTE: this is a standard deviation (points), whereas `metric` holds mean
# squared errors (points squared). Take the square root of the MSE before
# comparing the two.
train['total_points'].std()

In [None]:
# Refit on the whole training frame, then score the held-out test frame.
X_train = train.drop(columns='total_points')
y_train = train['total_points']
model.fit(X_train, y_train)

X_test = test.drop(columns='total_points')
pred = model.predict(X_test)

In [None]:
# Root mean squared error on the test frame. The `squared=False` keyword was
# deprecated and then removed from scikit-learn, so the root is taken
# explicitly. This gives the same value on every version.
np.sqrt(mean_squared_error(test['total_points'], pred))

In [None]:
# Standard deviation of the test-set 'total_points', shown for scale next to
# the RMSE.
test['total_points'].std()

In [None]:
# Show the player names that were saved before 'name' was dropped from test.
name

In [None]:
# `name` still carries the row labels test had before set_index, while test is
# now indexed by kickoff_time. Assigning the Series directly would align on
# those labels and match nothing, so the values are attached by position.
test['name'] = name.to_numpy()

In [None]:
# Re-attach the saved player names by position. A plain list carries no index,
# so no label alignment takes place.
test['name'] = name.tolist()

In [None]:
# Rows of the test frame belonging to one player.
test.loc[test['name'].eq('Mo Salah')]

In [None]:
# Attach the predictions as a column. pred was produced from test itself, so
# it is already in test's row order.
test['predictions'] = pred

In [None]:
# The `.dt` accessor exists on Series only. A DatetimeIndex exposes its
# components directly, so `test.index.dt` raises AttributeError.
test['year'] = test.index.year

In [None]:
# Largest 'total_points' value in the training frame.
train['total_points'].max()

In [None]:
# Copy the kickoff_time index into a regular column so that the Series `.dt`
# accessor can be used on it.
test['date'] = test.index

In [None]:
# Calendar year of each kickoff, taken from the 'date' column.
test['year'] = test['date'].dt.year

In [None]:
# Show the re-attached 'name' column.
test['name']

In [None]:
# Total actual and predicted points per (year, name).
# NOTE(review): 'year' is the calendar year of the kickoff. If a season spans
# two calendar years, its matches land in two separate groups here.
season_data = (
    test
    .groupby(['year', 'name'])[['total_points', 'predictions']]
    .sum()
)

In [None]:
# Rank the (year, name) totals by actual points, highest first.
season_data.sort_values('total_points',ascending=False)

In [None]:
# Rank the (year, name) totals by predicted points, highest first.
season_data.sort_values('predictions',ascending=False)


In [None]:
# Feature importances of the fitted model, one row per input column.
# The scikit-learn wrapper exposes them as the `feature_importances_`
# attribute; `feature_importance` does not exist on LGBMRegressor.
imp = pd.DataFrame({
    'name': model.feature_name_,
    'importance': model.feature_importances_,
})