In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Union, Callable
import random

In [2]:
file_path = Path(r'C:\PythonProjects\social-media-preferences\03_Clustering_Marketing.csv')

In [3]:
df = pd.read_csv(file_path, keep_default_na=False)

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df_int_stats = df.select_dtypes('int').describe()

In [None]:
for col in df.select_dtypes('object').columns:
    display(df[col].unique())

In [None]:
for col in df.select_dtypes('object').columns:
    display(f"Column '{col}', number of missing values: {df[col].isna().sum()}")


In [None]:
df['age']\
    .where(df['age'].str.contains(pat='[a-z]'))\
        .sort_values(ascending=False)\
            .dropna(how='all')

In [None]:
coin_flip(chanches=0.6)

In [None]:
df_int_stats

In [None]:
df_int_stats.loc[['min', 'mean', 'max', 'std'],]

In [None]:
RAW_FEATURES = [
    "gradyear", "age", "NumberOffriends", "basketball",
    "football", "soccer", "softball", "volleyball", "swimming", "cheerleading",
    "baseball", "tennis", "sports", "cute", "sex", "sexy", "hot", "kissed",
    "dance", "band", "marching", "music", "rock", "god", "church", "jesus",
    "bible", "hair", "dress", "blonde", "mall", "shopping", "clothes",
    "hollister", "abercrombie", "die", "death", "drunk", "drugs", "gender"
]

In [4]:
def coin_flip(chanches: float = 0.80) -> bool:
    if random.random() < chanches:
        return True
    else:
        return False

In [191]:
def produce_outliers(value: str | int | None) -> str | int:
    print(f'Producing outlier, input value={value}')
    if str(value).isalpha():
        #print(f'Output value={value}')
        return value
    elif value is None:
        #print(f'Output value={value}')
        return value
    elif str(value).isnumeric() and value == 0:
        #print(f'Output value={value}')
        return np.random.randint(-100, -1)
    elif str(value).isnumeric():
        #print(f'Output value={value}')
        return -value

In [192]:
def set_string(value: str | int | None) -> str:
    print(f'Setting as String, input value={value}')
    if str(value).isalpha():
        #print(f'Output value={value}')
        return value
    elif str(value).isnumeric():
        #print(f'Output value={value}')
        return str(value)

In [193]:
def set_numeric(value: str | int | None) -> str | int:
    print(f'Setting numerical value, input value={value}')
    if str(value).isalpha():
        #print(f'Output value={value}')
        return value
    elif value is None:
        #print(f'Output value={value}')
        return pd.NA
    elif isinstance(value, str):
        return np.random.randint(0, 2009)

In [194]:
def produce_nulls(value: str | int | None) -> pd.NA:
    print('Producing Null value')
    value = pd.NA
    return value

In [197]:
def new_categories(value: str) -> str:
    if random.random() > 0.1:
        if value in ['F', 'M', 'NA']:
            print(f'Producing new category for gender, input value={value}')
            return np.random.choice(['B', 'NB', 'FTM', 'MTF', 'OTHER', ''])
        else:
            return produce_nulls(value)

In [None]:
def grad_year_issues(value: int) -> int | None:
    lower_grad_bound = np.random.randint(1900, 2005)
    upper_grad_bound = np.random.randint(2010, 2025)
    
    if random.random() > 0.3:
        print('Producing grad out of bounds values')
        return np.random.choice([lower_grad_bound, upper_grad_bound])
    else:
        if random.random() > 0.5:
            print('Producing null')
            return produce_nulls(value)
        else:
            print('Producing outliers')
            return produce_outliers(value)


In [None]:
def age_data_issues(value: float) -> int | None:
    random_old_age = np.random.randint(100, 150)
    if random.random() > 0.3:
        print('Producing unrealistic ages')
        return random_old_age
    else:
        if random.random() > 0.5:
            print('Producing null')
            return produce_nulls(value)
        else:
            print('Producing outliers')
            return produce_outliers(value)

In [None]:
def number_of_friends_issues(value: int) -> int | None:
    rand_num = np.random.randint(606, 1000)
    if random.random() > 0.5:
        print('Producing more num of friends')
        return rand_num
    else:
        print('Producing null')
        return produce_nulls(value)

In [None]:
def new_feature(dataframe: pd.DataFrame) -> pd.DataFrame:
    protected_cols = ['gradyear', 'gender', 'age', 'NumberOffriends', 'volleyball', 'drunk', 'drugs']
    rename_candidates = [col for col in dataframe.columns if col not in protected_cols]

    new_features = ['iphone', 'highschool', 'summer', 'videogames', 'youtube', 'lol', 'tacos', 'tequila']
    random_new_feature = np.random.choice(new_features)

    if random.random() > 0.5:
        if rename_candidates:
            current_col_name = np.random.choice(rename_candidates)
            dataframe = dataframe.rename(columns={current_col_name: random_new_feature})
    else:
        dataframe[random_new_feature] = np.random.choice([0, 1], size=len(dataframe))

    return dataframe



In [223]:
issues_dict = {
    1: produce_nulls,
    2: new_categories,
    3: set_numeric,
    4: set_string,
    5: produce_outliers,
}

In [203]:
def apply_funcs(dataframe: pd.DataFrame, row: int, funcs) -> pd.DataFrame:
    random_col = np.random.choice([
        'gradyear', 'gender', 'age', 
        'NumberOffriends', 'volleyball', 'drunk', 'drugs'
    ])
    
    col_idx = dataframe.columns.get_loc(random_col)
    current_value = dataframe.iat[row, col_idx]

    if random.random() > 0.3:
        if random_col == 'gradyear':
            data_issue = grad_year_issues(current_value)
        elif random_col == 'gender':
            data_issue = new_categories(current_value)
        elif random_col == 'age':
            data_issue = age_data_issues(current_value)
        elif random_col == 'NumberOffriends':
            data_issue = number_of_friends_issues(current_value)
        elif random_col in ['volleyball', 'drunk', 'drugs']:
            rand_func = np.random.choice(funcs)
            data_issue = rand_func(current_value)
        
        dataframe.iat[row, col_idx] = data_issue
        print(f"Affected column = {random_col}, affected row = {row}")
    else:
        print('Producing new feature')
        dataframe = new_feature(dataframe)

    return dataframe


In [229]:
def set_data_issues(dataframe: pd.DataFrame, affected_rows: dict) -> pd.DataFrame:
    check_key_value_match = [k for (k, v) in affected_rows.items() if k == v]
    print(f'Dropping col? {'yes' if check_key_value_match else 'no'}')
    if check_key_value_match:
        col_number = np.random.randint(0, 39)
        return dataframe.drop(dataframe.columns[col_number], axis=1)
    else:
        for k, v in affected_rows.items():
            pick_random_issue = np.random.choice(np.arange(1, 6), size=5, replace=False)
            funcs = [func for (k, func) in issues_dict.items() if k in pick_random_issue]
            #print(f'Sending the following funcs: {funcs}')
            #print(f'Row {k}')
            dataframe = apply_funcs(dataframe, funcs=funcs, row=k)
            print('.........................................................')
        return dataframe

In [210]:
def determine_affected_rows(dataframe: pd.DataFrame) -> dict:
    number_of_rows = dataframe.shape[0] - 1
    print(f'Dataframe shape {dataframe.shape}')
    afected_rows = np.random.randint(1, number_of_rows, size=number_of_rows)
    set_issues_dict = {}
    for idx in afected_rows:
        if idx not in set_issues_dict.keys():
            set_issues_dict[idx] = 1
        else:
            set_issues_dict[idx] += 1
    return {k:v for (k, v) in set_issues_dict.items() if v > 1}

In [211]:
def generate_data_issues(dataframe: pd.DataFrame = None, chances: float = None) -> pd.DataFrame:
    if coin_flip(chances):
        affected_rows = determine_affected_rows(dataframe)
        dataframe = set_data_issues(dataframe, affected_rows)
        return dataframe
    else:
        return dataframe

In [217]:
df_test = df.copy().sample(50)
df_test_output = generate_data_issues(dataframe=df_test, chances=0.9)
df_test_output
df_test_output.columns

Dataframe shape (50, 40)
Dropping col? no
Producing unrealistic ages
Affected column = age, affected row = 11
Producing new feature
Producing new feature
Producing new feature
Producing unrealistic ages
Affected column = age, affected row = 9
Producing more num of friends
Affected column = NumberOffriends, affected row = 39
Producing new feature
Producing null
Producing Null value
Affected column = age, affected row = 48
Producing grad out of bounds values
Affected column = gradyear, affected row = 46
Producing Null value
Affected column = volleyball, affected row = 8
Producing Null value
Affected column = volleyball, affected row = 35


Index(['gradyear', 'gender', 'age', 'NumberOffriends', 'basketball',
       'football', 'soccer', 'softball', 'volleyball', 'swimming',
       'cheerleading', 'baseball', 'tennis', 'sports', 'cute', 'sex', 'sexy',
       'hot', 'kissed', 'dance', 'band', 'marching', 'videogames', 'rock',
       'god', 'church', 'jesus', 'bible', 'hair', 'dress', 'blonde', 'mall',
       'shopping', 'clothes', 'hollister', 'abercrombie', 'die', 'death',
       'drunk', 'drugs', 'tacos', 'iphone'],
      dtype='object')

In [230]:
for i in range(200):
    df_test = df.copy().sample(50)
    df_test_output = generate_data_issues(dataframe=df_test, chances=0.9)
    print('______________________________________________________')

Dataframe shape (50, 40)
Dropping col? no
Producing Null value
Affected column = volleyball, affected row = 36
.........................................................
Producing new feature
.........................................................
Producing null
Producing Null value
Affected column = NumberOffriends, affected row = 4
.........................................................
Producing new feature
.........................................................
Producing Null value
Affected column = drugs, affected row = 23
.........................................................
Producing unrealistic ages
Affected column = age, affected row = 48
.........................................................
Producing new feature
.........................................................
Producing new feature
.........................................................
Setting numerical value, input value=0
Affected column = drunk, affected row = 7
....................................

  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue


Dataframe shape (50, 40)
Dropping col? no
Producing more num of friends
Affected column = NumberOffriends, affected row = 15
.........................................................
Producing Null value
Affected column = volleyball, affected row = 26
.........................................................
Producing Null value
Affected column = volleyball, affected row = 20
.........................................................
Producing new feature
.........................................................
Producing new feature
.........................................................
Producing new feature
.........................................................
Setting as String, input value=0.0
Affected column = volleyball, affected row = 40
.........................................................
Producing new category for gender, input value=F
Affected column = gender, affected row = 11
.........................................................
Producing Null value
Affected c

  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue


.........................................................
Producing new category for gender, input value=F
Affected column = gender, affected row = 4
.........................................................
Producing new feature
.........................................................
Producing new feature
.........................................................
Producing null
Producing Null value
Affected column = NumberOffriends, affected row = 35
.........................................................
Producing Null value
Affected column = drugs, affected row = 5
.........................................................
Producing Null value
Affected column = volleyball, affected row = 25
.........................................................
Producing new feature
.........................................................
Producing null
Producing Null value
Affected column = NumberOffriends, affected row = 15
.........................................................
Producing 

  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue


Dataframe shape (50, 40)
Dropping col? no
Setting as String, input value=0
Affected column = volleyball, affected row = 18
.........................................................
Producing outlier, input value=0
Affected column = volleyball, affected row = 48
.........................................................
Producing new feature
.........................................................
Producing Null value
Affected column = drugs, affected row = 31
.........................................................
Producing new feature
.........................................................
Producing unrealistic ages
Affected column = age, affected row = 16
.........................................................
Producing null
Producing Null value
Affected column = gradyear, affected row = 3
.........................................................
Producing null
Producing Null value
Affected column = NumberOffriends, affected row = 22
............................................

  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue


Dataframe shape (50, 40)
Dropping col? yes
______________________________________________________
Dataframe shape (50, 40)
Dropping col? no
Producing new feature
.........................................................
Producing Null value
Affected column = drunk, affected row = 15
.........................................................
Affected column = gender, affected row = 19
.........................................................
Producing Null value
Affected column = volleyball, affected row = 22
.........................................................
Setting numerical value, input value=0
Affected column = drugs, affected row = 32
.........................................................
Producing null
Producing Null value
Affected column = NumberOffriends, affected row = 11
.........................................................
Producing new feature
.........................................................
Producing Null value
Affected column = drugs, affected row = 3

  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue


Affected column = drunk, affected row = 14
.........................................................
Producing unrealistic ages
Affected column = age, affected row = 33
.........................................................
Producing new feature
.........................................................
Producing null
Producing Null value
Affected column = age, affected row = 18
.........................................................
Setting numerical value, input value=0
Affected column = volleyball, affected row = 6
.........................................................
Producing new category for gender, input value=F
Affected column = gender, affected row = 27
.........................................................
Producing Null value
Affected column = volleyball, affected row = 31
.........................................................
______________________________________________________
Dataframe shape (50, 40)
Dropping col? no
Producing new feature
.................

  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue


Producing null
Producing Null value
Affected column = age, affected row = 6
.........................................................
______________________________________________________
Dataframe shape (50, 40)
Dropping col? no
Setting numerical value, input value=0
Affected column = volleyball, affected row = 5
.........................................................
Producing grad out of bounds values
Affected column = gradyear, affected row = 42
.........................................................
Producing Null value
Affected column = volleyball, affected row = 44
.........................................................
Producing Null value
Affected column = drugs, affected row = 11
.........................................................
Producing null
Producing Null value
Affected column = NumberOffriends, affected row = 6
.........................................................
Producing new feature
.........................................................
Producing 

  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
  dataframe.iat[row, col_idx] = data_issue
