In [43]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

import sklearn as skl
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import ensemble

import xgboost as xgb
from xgboost import XGBClassifier


In [54]:
df = pd.read_excel('eurovision_song_contest_1975_2022.xlsx')

In [55]:
#Removing unwanted observations and columns
df = df.drop(df.index[df['Duplicate'] == 'x'])
df = df.drop(['Edition', 'Duplicate',], axis = 1)
df = df.drop(df.index[df['Year'] < 2008]) #Dropping the years before 2008
#Replacing Country names
df = df.replace(to_replace ="F.Y.R. Macedonia",
                 value ="North Macedonia")
df = df.replace(to_replace ="Macedonia",
                 value ="North Macedonia")
df = df.replace(to_replace ="Netherlands",
                 value ="The Netherlands")

#Removing 0 values, and semi-final observations and Jury votes
df = df.drop(df.index[df['Points      '] == 0])
df = df.drop(df.index[df['(semi-) final']=='sf1'])
df = df.drop(df.index[df['(semi-) final']=='sf2'])
df = df.drop(df.index[df['Jury or Televoting']=='J'])
df = df.reset_index()


#Encoding string class values as integers
le = preprocessing.LabelEncoder()
df1=df
df1['(semi-) final'] = le.fit_transform(df1['(semi-) final'].values)
df1['Jury or Televoting'] = le.fit_transform(df1['Jury or Televoting'].values)
df1['From country'] = le.fit_transform(df1['From country'].values)
df1['To country'] = le.fit_transform(df1['To country'].values)

### Gradient Boosting Classifier Method

Prediction by using the points as classes.

In [56]:
#We use the data from years 2008 to 2019, 2021 as train data and the data from 2022 as test data. 
#Categorizing the points into classes
df1['Points      '] = pd.Categorical(df1['Points      '])
#Train set
train_set = df1[df1['Year'] < 2022]
y_train = train_set['Points      ']
train_set.drop(labels='Points      ', axis=1, inplace=True)

#Test set
test_set = df1[df1['Year'] == 2022]
y_test = test_set['Points      ']
test_set.drop(labels='Points      ', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set.drop(labels='Points      ', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.drop(labels='Points      ', axis=1, inplace=True)


In [57]:
#Setting different learning Rates in order to see which performs the best
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators = 20, learning_rate = learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(train_set, y_train)
    
    print("Learning rate: {0:.3f}", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(train_set, y_train)))
    print("Accuracy score (test): {0:.3f}".format(gb_clf.score(test_set, y_test)))

Learning rate: {0:.3f} 0.05
Accuracy score (training): 0.189
Accuracy score (test): 0.100
Learning rate: {0:.3f} 0.075
Accuracy score (training): 0.188
Accuracy score (test): 0.100
Learning rate: {0:.3f} 0.1
Accuracy score (training): 0.205
Accuracy score (test): 0.100
Learning rate: {0:.3f} 0.25
Accuracy score (training): 0.232
Accuracy score (test): 0.090
Learning rate: {0:.3f} 0.5
Accuracy score (training): 0.249
Accuracy score (test): 0.100
Learning rate: {0:.3f} 0.75
Accuracy score (training): 0.261
Accuracy score (test): 0.100
Learning rate: {0:.3f} 1
Accuracy score (training): 0.241
Accuracy score (test): 0.103


In [59]:
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.75, max_features=2, max_depth=2, random_state=2)
gb_clf2.fit(train_set, y_train)
predictions = gb_clf2.predict(test_set)
predictions

#print("Confusion Matrix:")
#print(confusion_matrix(y_test, predictions))

#print("Classification Report")
#print(classification_report(y_test, predictions))

array([10, 10, 10, 12, 12,  2, 12,  2,  4,  4, 10, 10, 12, 12, 12, 12,  2,
        2,  4,  4,  2,  2,  2,  2,  2,  2,  2,  2,  4,  4, 10,  2,  2,  2,
        2,  2,  2,  2,  4,  4, 10, 10,  2,  2,  2,  2,  2,  2,  4,  4,  2,
        2,  2,  2,  2,  2,  2,  2,  4,  4, 10, 10,  2,  2,  2,  2,  2,  2,
        4,  4, 10, 10,  2,  2,  2,  2,  2,  2,  2,  4, 10, 10,  2,  2,  2,
        2,  2,  2,  4,  4, 10, 10,  2,  2,  2,  2,  2,  2,  4,  4, 10,  2,
        2,  2,  2,  2,  2,  2,  4,  4, 10, 10,  2,  2,  2,  2,  2,  2,  4,
        4, 10, 10,  2,  2,  2,  2,  2,  2,  4,  4,  2,  2,  2,  2,  2,  2,
        2,  2,  4,  4,  2,  3, 10, 10,  2,  2,  2,  2,  4,  4,  2,  2,  2,
        2,  2,  2,  2,  2,  4,  4, 10,  2,  2,  2,  2,  2,  2,  2,  4,  4,
        2,  2,  2,  2,  2,  2,  2,  2,  4,  4,  2,  2,  2,  2,  2,  2,  2,
        2,  4,  4,  2,  2,  2,  2,  2,  2,  2,  2,  4,  4,  2,  2,  2,  2,
        2,  2,  2,  2,  4,  4, 10, 10,  2,  2,  2,  2,  2,  2,  4,  4, 10,
        2,  2,  2,  2,  2

### Classifier method using XGBOOST

In [60]:
xgb_clf = XGBClassifier()
xgb_clf.fit(train_set, y_train)

score = xgb_clf.score(test_set, y_test)
print(score)

xgpredictions = xgb_clf.predict(test_set)
xgpredictions
#accuracy = accuracy_score(y_test, xgpredictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


0.10256410256410256


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


array([ 2,  5,  2,  7,  2,  4, 12,  4,  4,  4,  2,  2,  4,  2,  8, 12,  4,
        2,  4,  4,  4,  4,  2,  4,  4, 12,  4,  2,  4,  4,  2,  4,  2,  4,
        4, 12,  4,  2,  4,  4,  2,  2,  4,  4,  4,  2,  2,  2,  4,  4,  4,
        4,  4,  4,  8, 12,  4,  2,  4,  2,  4,  2,  2,  2,  4,  4,  5,  2,
        4,  6,  2,  5,  4,  4,  4,  4, 12,  4,  2,  4,  2,  2,  4,  4,  4,
        4, 12,  4,  2,  2,  2,  5,  4,  4,  4, 12,  4,  2,  4,  6,  2,  4,
        4,  4,  4,  4,  4,  2,  4,  6,  5,  2,  4,  4,  5,  4,  2,  2,  4,
        4,  6,  6,  4,  4,  4,  4,  2,  2,  4,  6,  6,  4,  4,  4,  8,  4,
        4,  2,  4,  6,  6,  6,  6,  2,  4,  4,  4,  2,  4,  4,  4,  4,  4,
        4,  4,  4,  2,  2,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  2,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        2,  4,  4,  4,  6,  6,  4,  4,  4,  4,  6,  6,  6,  6,  6,  4,  6,
        6,  4,  4,  4,  6,  6,  6,  6,  6,  6,  2,  4,  4,  4,  4,  6,  6,
        6,  6,  6,  6,  6

In [39]:
np.array(y_test)

array([ 1,  7, 10, 10,  7,  4,  2,  3,  2,  2,  1,  3,  1,  4,  4,  4,  3,
        2,  1,  5,  7,  2,  8,  8,  2, 12,  2,  6, 12,  8,  2,  3,  3,  4,
        5,  6,  8,  6,  4,  4,  7, 12,  1,  8,  6, 12,  5,  4,  8,  8,  7,
        7,  5,  2,  6,  6,  5,  6,  8,  4,  7,  4,  2, 12,  4,  4,  8,  7,
        6,  4,  6,  2, 12, 10,  7,  8,  2,  3,  7,  6,  5, 10,  8,  7,  7,
        8,  1,  3,  8,  3,  6,  5,  4,  5, 12,  8,  6, 10, 10,  6, 12,  6,
        6, 12,  5,  5,  8,  2, 12,  8, 12,  7,  2,  7,  3,  8, 10, 12, 10,
        5,  4,  7, 12,  3, 12,  4,  1,  6, 10,  1,  6,  5,  3,  5, 10,  8,
        3,  1,  5,  3,  5, 10,  5,  8,  6, 10, 12,  2,  5,  1, 10,  7,  8,
       10,  8, 12, 10, 10,  6,  5,  8,  8, 10,  8,  7, 10,  5,  6,  7,  7,
       10, 12,  8,  8,  7, 10,  7, 10, 10, 12, 12, 10, 10,  3, 10,  2, 12,
        3,  6,  3,  7,  5,  1,  4, 10,  5, 10, 12,  4, 12,  3,  5, 12,  6,
        2, 12,  4,  3,  2,  4,  7,  1,  3, 10, 12,  8,  2,  2,  1,  2,  3,
        3,  4,  5,  3,  2

Poor prediction 

### Gradient Boosting Regression Method

Here we consider the points as a continuous variable

In [61]:
#Encoding string class values as integers
le = preprocessing.LabelEncoder()
df2=df
df2['(semi-) final'] = le.fit_transform(df2['(semi-) final'].values)
df2['Jury or Televoting'] = le.fit_transform(df2['Jury or Televoting'].values)
df2['From country'] = le.fit_transform(df2['From country'].values)
df2['To country'] = le.fit_transform(df2['To country'].values)

#Train set
train_set = df2[df2['Year'] < 2022]
y_train = train_set['Points      ']
train_set.drop(labels='Points      ', axis=1, inplace=True)

#Test set
test_set = df2[df2['Year'] == 2022]
y_test = test_set['Points      ']
test_set.drop(labels='Points      ', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set.drop(labels='Points      ', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.drop(labels='Points      ', axis=1, inplace=True)


In [62]:
params = {'n_estimators' : 4, 'max_depth' : 3, 'learning_rate' : 0.5, 'criterion':'mse'}
gb_reg = ensemble.GradientBoostingRegressor(**params)
gb_reg.fit(train_set, y_train)
print("The mean accuracy score is:")
print(gb_reg.score(train_set,y_train)) #Mean accuracy

predictions_reg = np.round(gb_reg.predict(test_set))
print(predictions_reg)

The mean accuracy score is:
0.12867511140571475
[5. 5. 5. 5. 5. 6. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5.
 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5. 5.
 7. 5. 5. 5. 6. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5.
 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5.
 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 6. 5. 5. 5. 7. 5.
 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 6. 5. 5. 5. 5. 5. 5. 5. 7. 5. 6. 5. 5. 5.
 5. 5. 5. 5. 7. 5. 5. 5. 6. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5. 5.
 7. 5. 5. 5. 6. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5.
 6. 5. 5. 5. 5. 5. 7. 5. 5. 6. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5.
 5. 5. 7. 4. 5. 5. 5. 6. 5. 5. 5. 5. 7. 4. 5. 5. 5. 5. 5. 5. 5. 5. 7. 4.
 5. 5. 5. 5. 5. 5. 5. 5. 7. 4. 5. 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5.
 5. 5. 5. 5. 7. 4. 5. 5. 5. 5. 5. 5. 5. 5. 5. 7. 5. 5. 5. 5. 5. 5. 5. 5.
 7. 4. 5. 5. 5. 5. 5. 5. 5. 5. 7. 4. 5. 5. 5. 6. 5. 5. 5. 5. 7. 4. 5. 5.
 5.



In [47]:
np.array(y_test)

array([ 1,  7, 10, 10,  7,  4,  2,  3,  2,  2,  1,  3,  1,  4,  4,  4,  3,
        2,  1,  5,  7,  2,  8,  8,  2, 12,  2,  6, 12,  8,  2,  3,  3,  4,
        5,  6,  8,  6,  4,  4,  7, 12,  1,  8,  6, 12,  5,  4,  8,  8,  7,
        7,  5,  2,  6,  6,  5,  6,  8,  4,  7,  4,  2, 12,  4,  4,  8,  7,
        6,  4,  6,  2, 12, 10,  7,  8,  2,  3,  7,  6,  5, 10,  8,  7,  7,
        8,  1,  3,  8,  3,  6,  5,  4,  5, 12,  8,  6, 10, 10,  6, 12,  6,
        6, 12,  5,  5,  8,  2, 12,  8, 12,  7,  2,  7,  3,  8, 10, 12, 10,
        5,  4,  7, 12,  3, 12,  4,  1,  6, 10,  1,  6,  5,  3,  5, 10,  8,
        3,  1,  5,  3,  5, 10,  5,  8,  6, 10, 12,  2,  5,  1, 10,  7,  8,
       10,  8, 12, 10, 10,  6,  5,  8,  8, 10,  8,  7, 10,  5,  6,  7,  7,
       10, 12,  8,  8,  7, 10,  7, 10, 10, 12, 12, 10, 10,  3, 10,  2, 12,
        3,  6,  3,  7,  5,  1,  4, 10,  5, 10, 12,  4, 12,  3,  5, 12,  6,
        2, 12,  4,  3,  2,  4,  7,  1,  3, 10, 12,  8,  2,  2,  1,  2,  3,
        3,  4,  5,  3,  2

### XGBOOST Regression Method

In [63]:
reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.5,
                    max_depth = 10, alpha = 10, n_estimators = 10)
reg.fit(train_set,y_train)
y_pred = reg.predict(test_set)
np.round(y_pred)



  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


array([5., 5., 5., 6., 6., 5., 6., 5., 6., 4., 5., 5., 6., 6., 6., 6., 5.,
       5., 6., 4., 6., 6., 6., 6., 6., 6., 5., 5., 6., 4., 5., 6., 6., 6.,
       6., 6., 5., 5., 6., 4., 5., 5., 6., 6., 6., 6., 5., 5., 6., 4., 6.,
       6., 5., 6., 6., 6., 5., 5., 6., 4., 5., 5., 6., 6., 6., 6., 6., 5.,
       6., 4., 5., 5., 6., 6., 6., 6., 6., 5., 5., 6., 5., 5., 6., 6., 6.,
       6., 6., 5., 6., 4., 5., 5., 6., 6., 6., 6., 5., 5., 6., 4., 5., 5.,
       6., 6., 6., 6., 5., 5., 6., 4., 5., 5., 6., 6., 5., 6., 5., 5., 6.,
       4., 5., 5., 6., 6., 6., 6., 5., 5., 6., 4., 5., 6., 6., 6., 6., 6.,
       6., 5., 6., 4., 5., 5., 5., 5., 6., 6., 6., 5., 6., 4., 6., 6., 5.,
       6., 6., 6., 5., 5., 6., 4., 5., 6., 6., 6., 6., 6., 5., 5., 6., 4.,
       6., 6., 5., 6., 6., 6., 5., 5., 6., 4., 6., 6., 6., 6., 6., 6., 5.,
       5., 6., 4., 6., 6., 5., 6., 6., 6., 5., 5., 6., 4., 6., 5., 6., 6.,
       6., 6., 6., 5., 6., 4., 5., 5., 6., 6., 6., 6., 5., 5., 6., 4., 5.,
       6., 6., 5., 6., 6.