In [290]:
import pandas as pd
import matplotlib.pyplot as plt

In [292]:
race_mapping = {
    8516: "Black",
    8527: "White",
    8657: "Native American", 
    8515: "Asian",
    8557: "Native Hawaiian",
    0: "Unknown"
}

ethnicity_mapping = {
    38003564: "Not Hispanic or Latino",
    38003563: "Hispanic or Latino",
    0: "Unknown"
}

In [294]:
age_bins = [15, 20, 25, 30, 35, 40, 45, 50]
age_labels = ['15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49']



In [296]:
df = pd.read_csv('cohort_3_lin.csv')

In [298]:
df['race_concept_id'] =  df['race_concept_id'].map(race_mapping)
df['ethnicity_concept_id'] = df['ethnicity_concept_id'].map(ethnicity_mapping)

In [300]:
df['age_range'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)

In [302]:
df.to_csv('updated_cohort_3_lin.csv', index = False)

In [304]:
print(df)

      Unnamed: 0  subject_id  year_of_birth  age race_concept_id  \
0              0     6146755           1970   48           Black   
1              1     6224673           1969   49           White   
2              2     3217106           1979   39         Unknown   
3              3     4803844           1974   44         Unknown   
4              4      904744           1973   45           White   
...          ...         ...            ...  ...             ...   
7347        7347     2081482           1972   46           Asian   
7348        7348     2083922           2002   16         Unknown   
7349        7349     2090068           1983   35           White   
7350        7350     2091318           1977   41         Unknown   
7351        7351     2091938           1985   33         Unknown   

        ethnicity_concept_id  inpt_ct  ed_ct  outpt_ct  class  output  \
0     Not Hispanic or Latino        0      0         0      1       0   
1     Not Hispanic or Latino        0

In [306]:
filtered_df = df.query('output == 1')
print(filtered_df)

      Unnamed: 0  subject_id  year_of_birth  age race_concept_id  \
8              8     2826870           1977   41           White   
9              9     5870778           1977   41         Unknown   
11            11     5704118           1970   48         Unknown   
14            14     2930218           1980   38           White   
15            15     5995498           1974   44           Asian   
...          ...         ...            ...  ...             ...   
7335        7335     2060275           1978   40         Unknown   
7341        7341     2069556           1939   79           White   
7343        7343     2070744           1976   42         Unknown   
7348        7348     2083922           2002   16         Unknown   
7349        7349     2090068           1983   35           White   

        ethnicity_concept_id  inpt_ct  ed_ct  outpt_ct  class  output  \
8     Not Hispanic or Latino        0      0         0      1       1   
9                    Unknown        0

In [308]:
total_population = len(filtered_df)
race_count = filtered_df['race_concept_id'].value_counts()
ethnicity_count = filtered_df['ethnicity_concept_id'].value_counts()
age_count = filtered_df['age_range'].value_counts().sort_index()
print(race_count)
print()
print(ethnicity_count)

race_concept_id
Unknown            678
White              484
Black              288
Asian               80
Native Hawaiian     12
Native American      3
Name: count, dtype: int64

ethnicity_concept_id
Not Hispanic or Latino    676
Hispanic or Latino        587
Unknown                   289
Name: count, dtype: int64


In [310]:
race_statistics = pd.DataFrame({'Count': race_count, 'Percentage': ((race_count/total_population) * 100)})
ethnicity_statistics = pd.DataFrame({'Count': ethnicity_count, 'Percentage': ((ethnicity_count/total_population) * 100)})
age_statistics = pd.DataFrame({'Count': age_count, 'Percentage': ((age_count/total_population) * 100)})

print(race_statistics)
print(ethnicity_statistics)
print(age_statistics)

                 Count  Percentage
race_concept_id                   
Unknown            678   43.685567
White              484   31.185567
Black              288   18.556701
Asian               80    5.154639
Native Hawaiian     12    0.773196
Native American      3    0.193299
                        Count  Percentage
ethnicity_concept_id                     
Not Hispanic or Latino    676   43.556701
Hispanic or Latino        587   37.822165
Unknown                   289   18.621134
           Count  Percentage
age_range                   
15-19         39    2.512887
20-24         59    3.801546
25-29         82    5.283505
30-34        181   11.662371
35-39        385   24.806701
40-44        417   26.868557
45-49        321   20.682990
