In [1]:
import pandas as pd
import re
from sklearn.cluster import KMeans

In [2]:
def load_data(path='files/olympics.csv'):
    """Load the Olympic medal table and tidy it into a country-indexed frame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``'files/olympics.csv'``, the
        path this function previously had hard-coded.

    Returns
    -------
    pandas.DataFrame
        The table with the first CSV line skipped, ``'Unnamed: 0'`` renamed
        to ``'Country_name'``, the header markers ``'01 !'``, ``'02 !'`` and
        ``'03 !'`` replaced by ``'Gold'``, ``'Silver'`` and ``'Bronze'``,
        country names cut at the first non-breaking space, the country name
        used as index (and kept as a column), and the ``'Totals'`` row
        removed.

    Raises
    ------
    KeyError
        If the file has no ``'Totals'`` row to drop.
    """
    medal_markers = (('01 !', 'Gold'), ('02 !', 'Silver'), ('03 !', 'Bronze'))

    def _readable(column):
        # Translate one raw header into its readable form.
        if column == 'Unnamed: 0':
            return 'Country_name'
        for marker, medal in medal_markers:
            if marker in column:
                return column.replace(marker, medal)
        return column

    olympics = pd.read_csv(path, skiprows=1, encoding='utf-8')
    # One rename pass instead of an in-place rename per matching column.
    olympics = olympics.rename(columns=_readable)
    # Keep only the text before the first non-breaking space in each name.
    olympics['Country_name'] = olympics['Country_name'].str.split('\xa0').str[0]
    # drop=False keeps 'Country_name' as a column; other helpers read it.
    olympics = olympics.set_index('Country_name', drop=False)
    return olympics.drop('Totals', axis=0)

def first_country(df):
    """Return the first row of ``df`` as a Series (selected by position)."""
    top_row = df.iloc[0]
    return top_row

def gold_medal(df):
    """Return the name of the country with the largest ``'Gold.2'`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Country_name'`` and ``'Gold.2'``.

    Returns
    -------
    The ``'Country_name'`` entry of the first row whose ``'Gold.2'`` equals
    the column maximum.

    Raises
    ------
    IndexError
        If no row matches (for example, an empty frame).
    """
    combined_gold = df['Gold.2']
    leaders = df.loc[combined_gold == combined_gold.max(), 'Country_name']
    # Select by position: a bare [0] is a label lookup and breaks whenever
    # the index has no label 0 (or relies on a deprecated positional fallback).
    return leaders.iloc[0]


def biggest_difference_in_gold_medal(df):
    """Return the country with the largest gap between ``'Gold'`` and ``'Gold.1'``.

    The gap is the absolute difference of the two gold-medal columns. The
    previous implementation compared ``'Total'`` and ``'Total.1'``, which
    does not match the function's name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Country_name'``, ``'Gold'`` and
        ``'Gold.1'``.

    Returns
    -------
    The ``'Country_name'`` entry of the first row whose gap equals the
    largest gap.

    Raises
    ------
    IndexError
        If no row matches (for example, an empty frame).
    """
    gold_gap = (df['Gold'] - df['Gold.1']).abs()
    widest = df.loc[gold_gap == gold_gap.max(), 'Country_name']
    # Positional access: the index holds country names, not integers.
    return widest.iloc[0]

def get_points(df, gold_weight=3, silver_weight=2, bronze_weight=1):
    """Compute a weighted medal score per row and store it in ``df['points']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Gold.2'``, ``'Silver.2'`` and
        ``'Bronze.2'``. It is modified in place: a ``'points'`` column is
        added or overwritten.
    gold_weight, silver_weight, bronze_weight : numeric, optional
        Points awarded per medal. The defaults (3, 2, 1) reproduce the
        original fixed weighting.

    Returns
    -------
    pandas.Series
        The ``'points'`` column of ``df``.
    """
    df['points'] = (
        df['Gold.2'] * gold_weight
        + df['Silver.2'] * silver_weight
        + df['Bronze.2'] * bronze_weight
    )
    return df['points']

def k_means(df, n_clusters=3, random_state=None):
    """Cluster the weighted medal points with k-means and return the centres.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed to ``get_points``, which also writes a ``'points'`` column
        into it.
    n_clusters : int, optional
        Number of clusters. Defaults to 3, the value previously hard-coded.
    random_state : optional
        Forwarded to ``KMeans``. The default ``None`` keeps the original
        unseeded behaviour; pass an integer for a repeatable result.

    Returns
    -------
    The fitted model's ``cluster_centers_`` attribute.
    """
    # Series.reshape was deprecated and later removed from pandas, so reshape
    # the underlying array into a single-feature column instead.
    points = get_points(df).values.reshape(-1, 1)
    km = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        random_state=random_state,
    ).fit(points)
    return km.cluster_centers_

# Loaded and preprocessed dataframe

In [9]:
# Build the cleaned medal table; the cells below all operate on `df`.
df = load_data()
# Preview the first rows.
df.head()

Unnamed: 0_level_0,Country_name,# Summer,Gold,Silver,Bronze,Total,# Winter,Gold.1,Silver.1,Bronze.1,Total.1,# Games,Gold.2,Silver.2,Bronze.2,Combined total
Country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Afghanistan,Afghanistan,13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria,Algeria,12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina,Argentina,23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia,Armenia,5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia,Australasia,2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


# Details of the first country in the dataframe.

In [4]:
# Show the first row of the frame as a Series.
first_country(df)

Country_name      Afghanistan
# Summer                   13
Gold                        0
Silver                      0
Bronze                      2
Total                       2
# Winter                    0
Gold.1                      0
Silver.1                    0
Bronze.1                    0
Total.1                     0
# Games                    13
Gold.2                      0
Silver.2                    0
Bronze.2                    2
Combined total              2
Name: Afghanistan, dtype: object

# Country with most gold medals

In [5]:
# Returns a single country name (the first match if several rows tie).
gold_medal(df)

'United States'

# Country with biggest difference between their summer and winter gold medal counts

In [6]:
# Returns a single country name (the first match if several rows tie).
biggest_difference_in_gold_medal(df)

'United States'

# Weighted points for each country
### One gold medal counts for 3 points
### One silver medal counts for 2 points
### One bronze medal counts for 1 point

In [7]:
# Note: besides returning the scores, this call stores them in df['points'].
get_points(df)

Country_name
Afghanistan                            2
Algeria                               27
Argentina                            130
Armenia                               16
Australasia                           22
Australia                            923
Austria                              569
Azerbaijan                            43
Bahamas                               24
Bahrain                                1
Barbados                               1
Belarus                              154
Belgium                              276
Bermuda                                1
Bohemia                                5
Botswana                               2
Brazil                               184
British West Indies                    2
Bulgaria                             411
Burundi                                3
Cameroon                              12
Canada                               846
Chile                                 24
China                               1120
Col

# K-means clustering was run on the weighted points for each country. The three cluster centres indicate how successful each cluster of teams is.

In [8]:
# Returns the cluster centres. No random_state is passed here, so the order
# (and possibly the values) of the centres may differ between runs.
k_means(df)



array([[   73.46875   ],
       [ 1162.35294118],
       [ 5684.        ]])