In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from math import sqrt
# Paths of the two CSV inputs, one per gender.
db_male, db_female = "male.csv", "female.csv"

# Load each file into its own DataFrame (male first, then female).
male_db = pd.read_csv(filepath_or_buffer=db_male)
female_db = pd.read_csv(filepath_or_buffer=db_female)

In [None]:
# Preview the first five rows of the male table.
male_db.head(n=5)

In [None]:
# Preview the last five rows of the male table.
male_db.tail(n=5)

In [None]:
# Preview the first five rows of the female table.
female_db.head(n=5)

In [None]:
# Preview the last five rows of the female table.
female_db.tail(n=5)

In [None]:
def div_conversion(df, column, div):
    """Return ``df[column]`` with every value divided by ``div``.

    Args:
        df: DataFrame holding the column to scale; it is not modified.
        column: Label of the column to divide.
        div: Non-zero scalar divisor applied to every value.

    Returns:
        A new Series (same index as ``df``) containing the scaled values.

    Raises:
        ZeroDivisionError: If ``div`` is zero.
    """
    if div == 0:
        # Vectorised division would quietly produce inf/NaN, so reject a zero
        # divisor explicitly instead of propagating bad values downstream.
        raise ZeroDivisionError("div must be non-zero")
    # Vectorised arithmetic replaces a per-element Python lambda.
    return df[column] / div

In [None]:

# Scale both circumference columns of both tables down by a factor of 10
# (presumably millimetres -> centimetres; confirm against the data source).
# NOTE: the columns are overwritten in place, so running this cell twice
# divides the values twice.
for frame in (male_db, female_db):
    for measurement in ('chestcircumference', 'waistcircumference'):
        frame[measurement] = div_conversion(frame, measurement, 10)

In [None]:
# Print the two converted columns for each table under a banner line.
for banner, frame in ((" Male ", male_db), (" Female ", female_db)):
    print("*****" + banner + "*****")
    print(frame['chestcircumference'], end='\n\n')
    print(frame['waistcircumference'])

In [None]:
def prepare_db(df, gender, columns: list, sample_size):
    """Write the first ``sample_size`` rows of the selected columns to a CSV file.

    The file is named ``{gender}_sample{sample_size}.csv``, is created in the
    current working directory and does not contain the index. ``df`` itself
    is not modified and nothing is returned.

    Args:
        df: Source DataFrame.
        gender: Label used as the file-name prefix.
        columns: Column labels to keep, in the order they should be written.
        sample_size: Stop value of the positional row slice (``df.iloc[:sample_size]``).

    Raises:
        ValueError: If any requested column is not present in ``df``.
    """
    available = list(df.columns)
    missing = [column for column in columns if column not in available]
    if missing:
        raise ValueError(f"columns not found in DataFrame: {missing}")

    # Resolve labels to positions once; for duplicated labels the first match wins.
    positions = [available.index(column) for column in columns]
    sample = df.iloc[:sample_size, positions]
    sample.to_csv(f'{gender}_sample{sample_size}.csv', index=False)

In [None]:
# Export a 100-row chest/waist sample for each gender
# (writes male_sample100.csv and female_sample100.csv).
for prefix, frame in (('male', male_db), ('female', female_db)):
    prepare_db(frame, prefix, ['chestcircumference', 'waistcircumference'], 100)

In [None]:
# Size tables used by categories_by_size: each value is the exclusive upper
# bound on chest circumference for that size (presumably centimetres; confirm).
# The first entry, in insertion order, whose bound exceeds the measurement is
# chosen, so entries must stay sorted by ascending bound. The final 1000 acts
# as a catch-all for the largest size.
male_sizes = {"XXS": 84, "XS": 90, "S": 95, "M": 102, "L": 112, "XL": 122, "XXL": 133, "3XL": 1000}
# The smallest female size is spelled "XXS", matching male_sizes.
# NOTE(review): any CSV already labelled with the former misspelling "XSS"
# needs to be regenerated.
female_sizes = {"XXS": 75, "XS": 82, "S": 89, "M": 96, "L": 106, "XL": 116, "XXL": 1000}

In [None]:
def categories_by_size(df, sizes):
    """Label every row of ``df`` with a clothing size based on chest circumference.

    For each row the first entry of ``sizes`` (in insertion order) whose bound is
    strictly greater than the row's 'chestcircumference' is chosen.

    Args:
        df: DataFrame with a 'chestcircumference' column. It is modified in
            place: a 'clothingsize' column is added or overwritten.
        sizes: Mapping of size label -> exclusive upper bound.

    Returns:
        The same ``df`` object, now carrying the 'clothingsize' column.

    Raises:
        ValueError: If a measurement is not below any bound in ``sizes``
            (this includes NaN, which compares false against every bound).
    """
    bounds = list(sizes.items())
    cloth_sizes = []
    # Iterate over the values themselves rather than looking rows up by the
    # labels 0..n-1, so frames with a non-default index are handled too.
    for position, chest in enumerate(df['chestcircumference']):
        label = None
        for candidate, upper in bounds:
            if chest < upper:
                label = candidate
                break
        else:
            # No bound matched: fail with a clear message instead of leaking
            # StopIteration out of a bare next().
            raise ValueError(
                f"chestcircumference {chest!r} at row position {position} "
                f"does not fit any size bound"
            )
        cloth_sizes.append(label)
    df["clothingsize"] = cloth_sizes
    return df

In [None]:
# Load each cleaned table, attach a 'clothingsize' label per row and print it.
df_male = categories_by_size(pd.read_csv('cleaned_maleDB.csv'), male_sizes)
print(df_male)

df_female = categories_by_size(pd.read_csv('cleaned_femaleDB.csv'), female_sizes)
print(df_female)

In [None]:
# Banner for the male data set.
print("***** Male *****")

# NOTE(review): this re-reads the CSV, so the 'clothingsize' hue column has to
# be present in the file itself; no cell visible here writes it back.
df_male = pd.read_csv('cleaned_maleDB.csv')

# Frame contents, then its column index, then per-column non-null counts.
print(df_male)
print(df_male.columns, '\n')
print(df_male.count())

# Chest vs. waist scatter, coloured by clothing size.
sns.scatterplot(
    x='chestcircumference',
    y='waistcircumference',
    hue='clothingsize',
    data=df_male,
)
plt.show()

In [None]:
# Banner for the female data set.
print("***** Female *****")

# NOTE(review): this re-reads the CSV, so the 'clothingsize' hue column has to
# be present in the file itself; no cell visible here writes it back.
df_female = pd.read_csv('cleaned_femaleDB.csv')

# Frame contents, then its column index, then per-column non-null counts.
print(df_female)
print(df_female.columns, '\n')
print(df_female.count())

# Chest vs. waist scatter, coloured by clothing size.
sns.scatterplot(
    x='chestcircumference',
    y='waistcircumference',
    hue='clothingsize',
    data=df_female,
)
plt.show()

In [None]:
def k_nearest_neighbor(df, P: list, k: int):
    """Classify point ``P`` by majority vote among its ``k`` nearest rows of ``df``.

    Distance is Euclidean in the (chestcircumference, waistcircumference) plane.

    Args:
        df: DataFrame with 'chestcircumference', 'waistcircumference' and
            'clothingsize' columns.
        P: Query point; only ``P[0]`` (chest) and ``P[1]`` (waist) are used.
        k: Number of nearest rows that vote; must be at least 1. Values larger
            than the number of rows simply use every row.

    Returns:
        The 'clothingsize' value occurring most often among the ``k`` closest
        rows. On a tie, the label whose first voter is closest to ``P`` wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``df`` has no rows.
    """
    if k < 1:
        # A negative k would otherwise slice "all but the last |k|" rows.
        raise ValueError(f"k must be at least 1, got {k}")
    if df.shape[0] == 0:
        raise ValueError("df must contain at least one row")

    chest_ref, waist_ref = P[0], P[1]

    # zip() walks the columns positionally, so the frame's index labels do not
    # have to be 0..n-1.
    scored = [
        (label, sqrt((chest - chest_ref) ** 2 + (waist - waist_ref) ** 2))
        for chest, waist, label in zip(
            df['chestcircumference'], df['waistcircumference'], df['clothingsize']
        )
    ]

    # sorted() is stable: equidistant rows keep their original relative order.
    nearest_labels = [label for label, _ in sorted(scored, key=lambda item: item[1])[:k]]

    votes = {}
    for label in nearest_labels:
        votes[label] = votes.get(label, 0) + 1
    # max() returns the first maximal key and dicts preserve insertion order,
    # which yields the tie-break described in the docstring.
    return max(votes, key=votes.get)

In [None]:
# Ask for the user's measurements and which reference table to compare against.
chest = input("Your chest size in CM?:   ")
waist = input("Your waist size in CM?:   ")
# Normalise the answer so that e.g. " Female" is accepted as well.
gender = input("male / female?        ").strip().lower()

# Each gender is matched against its own table ("female" -> female CSV,
# "male" -> male CSV). Unknown answers are rejected rather than silently
# falling back to one of the tables.
csv_by_gender = {"male": 'cleaned_maleDB.csv', "female": 'cleaned_femaleDB.csv'}
if gender not in csv_by_gender:
    raise ValueError(f"gender must be 'male' or 'female', got {gender!r}")
df = pd.read_csv(csv_by_gender[gender])

# Vote among the 3 nearest reference rows; float() raises ValueError on
# non-numeric measurements.
size = k_nearest_neighbor(df, [float(chest), float(waist)], 3)
print(f"You should buy a shirt size: {size}")