# Input

After finding out that random oversampling is a potential solution for our imbalanced data, we wanted to use it in the experiment. I therefore wrote a function that makes it easy to apply.

In [44]:
# Imports + dataframe creation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import RandomOverSampler

# Splitting function from Oscar
def split_classes(n_classes = 3, data = None):
    """Assign every row a class label by binning its 'User score'.

    The sorted unique scores are split into ``n_classes`` groups of (nearly)
    equal size with ``np.array_split``; a row's label is the position of the
    group that contains its score. The groups are balanced by number of
    distinct score values, not by number of rows.

    Parameters
    ----------
    n_classes : int, default 3
        Number of groups the unique scores are split into.
    data : pandas.DataFrame, optional
        Frame with a 'User score' column. When omitted, the module-level
        ``df`` is used, which is what existing ``split_classes()`` calls rely on.

    Returns
    -------
    list of int
        One label per row, in row order.

    Raises
    ------
    ValueError
        If any 'User score' is missing. A NaN never compares equal to a bin
        value, so such rows would otherwise get no label and the result
        would no longer line up with the rows of the frame.
    """
    frame = df if data is None else data
    scores = frame['User score']

    if scores.isna().any():
        raise ValueError(
            "'User score' contains missing values; every row needs a score to get a class label"
        )

    # Sorted distinct scores, split into n_classes consecutive groups
    unique_scores = np.sort(scores.unique())
    class_ranges = np.array_split(unique_scores, n_classes)

    # Every distinct score sits in exactly one group, so a single lookup table
    # replaces the per-row scan over all groups.
    label_by_score = {
        score: label
        for label, class_range in enumerate(class_ranges)
        for score in class_range
    }

    return [label_by_score[score] for score in scores]

# Load the target variable and features; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
df = pd.read_csv('/datc/nano/notebooks/Target variable & Features (V3).csv', index_col = 0)
# split_classes() reads the module-level `df`, so the frame must be loaded before this line.
df['Class'] = split_classes()

In [46]:
# Function
def applyRandomOversampling(data, algorithm, random_state=1):
    """Randomly oversample the rows of one threshold method so its classes are balanced.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Threshold method', 'User score' and 'Class' columns; all
        remaining columns are treated as features.
    algorithm : str
        Value of 'Threshold method' to keep (e.g. 'yen'). Only rows of this
        method are resampled.
    random_state : int, default 1
        Seed passed to ``RandomOverSampler`` so the result is reproducible.

    Returns
    -------
    pandas.DataFrame
        The resampled feature columns with the resampled 'Class' column joined on.

    Raises
    ------
    ValueError
        If ``data`` has no rows for ``algorithm``.
    """
    # Keep only the requested threshold method (previously hard-coded to 'yen',
    # which silently ignored the `algorithm` argument).
    selected = data[data['Threshold method'] == algorithm]
    if selected.empty:
        raise ValueError(
            "no rows with 'Threshold method' == {!r}".format(algorithm)
        )

    # 'Threshold method' is constant after filtering and 'User score' is the
    # raw target that 'Class' was derived from; neither is a feature.
    selected = selected.drop(['Threshold method', 'User score'], axis=1)

    x = selected.drop('Class', axis=1)
    y = selected['Class']

    ros = RandomOverSampler(random_state=random_state)
    x_res, y_res = ros.fit_resample(x, y)

    # The join aligns on the index, so this relies on the sampler returning
    # features and labels with the same index.
    return x_res.join(y_res)

# Build the oversampled dataset for the 'yen' threshold method.
df_over_yen = applyRandomOversampling(df, 'yen')

In [47]:
# Proof that it works: compare the class counts before and after oversampling.
# A cell only displays its last expression, so the "before" counts are printed
# explicitly; otherwise they would be computed and discarded.
print(df.loc[df['Threshold method'] == 'yen', 'Class'].value_counts())

# Class counts after oversampling (shown as the cell output)
df_over_yen['Class'].value_counts()

2    25
1    25
0    25
Name: Class, dtype: int64

## Output

My work on Random Oversampling was used in the experiment to compare models and balancing methods.