In [1]:
import pandas as pd

In [2]:
from pyodide.http import pyfetch

async def download(url, filename):
    """Fetch ``url`` with ``pyfetch`` and write the response body to ``filename``.

    Args:
        url: Address of the resource to fetch.
        filename: Local path that receives the raw response bytes.

    Raises:
        RuntimeError: If the server answers with any status other than 200,
            so a failed download is reported here instead of surfacing later
            as a missing or stale local file.
    """
    response = await pyfetch(url)
    if response.status != 200:
        raise RuntimeError(
            f"Download of {url} failed with HTTP status {response.status}"
        )
    # Read the payload before opening the file so a failed read does not
    # leave an empty file behind.
    payload = await response.bytes()
    with open(filename, "wb") as f:
        f.write(payload)

In [3]:
# Remote location of the survey CSV that the download cell fetches.
file_path = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "IBM-DA0321EN-SkillsNetwork/LargeData/m1_survey_data.csv"
)

In [4]:
await download(file_path, "m1_survey_data.csv")
file_name="m1_survey_data.csv"

In [5]:
# Load the downloaded survey CSV into the working DataFrame.
df = pd.read_csv(file_name)

In [6]:
# Alternative to the download cells above (left disabled): read the CSV straight from its URL.
#df = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/LargeData/m1_survey_data.csv")

In [11]:
# duplicated() flags every repeat of an earlier identical row (the first
# occurrence is not flagged); keep just the flagged rows.
duplicates = df.loc[df.duplicated()]

In [12]:
# Plain-text dump of the duplicate rows for a quick visual check.
print(duplicates)

      Respondent                                         MainBranch Hobbyist  \
1168        2339                     I am a developer by profession      Yes   
1169        2342                     I am a developer by profession      Yes   
1170        2343                     I am a developer by profession      Yes   
1171        2344                     I am a developer by profession      Yes   
1172        2347                     I am a developer by profession      Yes   
...          ...                                                ...      ...   
2297        4674  I am not primarily a developer, but I write co...      Yes   
2298        4675                     I am a developer by profession      Yes   
2299        4676                     I am a developer by profession      Yes   
2300        4677                     I am a developer by profession      Yes   
2301        4679                     I am a developer by profession      Yes   

                                       

In [13]:
# drop_duplicates() returns a new frame; `df` itself still holds the duplicates.
# NOTE(review): the later missing-value, WorkLoc and compensation cells all read
# `df`, not `df_no_duplicates`, so the duplicates remain in those results --
# confirm whether that is intended.
df_no_duplicates = df.drop_duplicates()

In [14]:
# Row count of the frame as loaded, duplicates included.
initial_row_count = len(df)

In [15]:
# Row count once the duplicate rows have been dropped.
final_row_count = len(df_no_duplicates)


In [16]:
# Report the size of `df` before deduplication.
print(f"Number of rows before removing duplicates: {initial_row_count}")


Number of rows before removing duplicates: 11552


In [17]:
# Report the size of `df_no_duplicates`, for comparison with the count above.
print(f"Number of rows after removing duplicates: {final_row_count}")


Number of rows after removing duplicates: 11398


In [18]:
# Per-column tally of missing cells: isna() marks them, sum() counts the marks.
missing_values = df.isna().sum()


In [19]:
# Show the per-column missing counts; a long Series is abbreviated in the
# middle when printed.
print(missing_values)


Respondent        0
MainBranch        0
Hobbyist          0
OpenSourcer       0
OpenSource       81
               ... 
Sexuality       547
Ethnicity       683
Dependents      144
SurveyLength     19
SurveyEase       14
Length: 85, dtype: int64


In [20]:
# Number of respondents with no answer recorded in 'WorkLoc'.
missing_workloc_count = df['WorkLoc'].isna().sum()


In [21]:
# Report how many 'WorkLoc' entries will need imputing.
print(f"Number of missing values in 'WorkLoc': {missing_workloc_count}")


Number of missing values in 'WorkLoc': 32


In [22]:
# Frequency of each 'WorkLoc' answer; dropna=False keeps the missing entries
# as their own NaN bucket so they are visible next to the real answers.
workloc_value_counts = df['WorkLoc'].value_counts(dropna=False)


In [23]:
# Show the distribution, including the NaN row produced by dropna=False.
print(workloc_value_counts)


Office                                            6905
Home                                              3638
Other place, such as a coworking space or cafe     977
NaN                                                 32
Name: WorkLoc, dtype: int64


In [24]:
# Rebind the counts without the NaN bucket (value_counts drops NaN by default)
# so the idxmax()/max() lookups below only consider real answers.
workloc_value_counts = df['WorkLoc'].value_counts()


In [25]:
# idxmax() returns the index label with the highest count, i.e. the modal answer.
most_frequent_workloc = workloc_value_counts.idxmax()


In [26]:
# How many respondents gave that modal answer.
most_frequent_count = workloc_value_counts.max()


In [27]:
# Report the modal 'WorkLoc' answer found by idxmax().
print(f"The most frequent value in 'WorkLoc' is: {most_frequent_workloc}")


The most frequent value in 'WorkLoc' is: Office


In [28]:
# Report how often the modal answer occurs.
print(f"Count of the most frequent value: {most_frequent_count}")


Count of the most frequent value: 6905


In [29]:
# Same expression as the earlier NaN-free value_counts() cell; `df` has not been
# modified in between, so this recomputation yields the same counts.
workloc_value_counts = df['WorkLoc'].value_counts()


In [30]:
# Modal 'WorkLoc' answer, recomputed from the counts above; it is the fill
# value used by the imputation cell that follows.
most_frequent_workloc = workloc_value_counts.idxmax()


In [31]:
# Impute missing 'WorkLoc' answers with the most frequent one.
# The filled column is assigned back rather than calling fillna(..., inplace=True)
# on the df['WorkLoc'] selection: pandas warns about that chained in-place form,
# and under Copy-on-Write it no longer updates `df`.
df['WorkLoc'] = df['WorkLoc'].fillna(most_frequent_workloc)


In [32]:
# Recount the nulls after imputation; the check in the next cell reads this value.
missing_workloc_count = df['WorkLoc'].isnull().sum()


In [33]:
# Confirm the imputation: any remaining nulls are reported with their count.
print(
    "All missing values in 'WorkLoc' have been imputed."
    if missing_workloc_count == 0
    else f"There are still {missing_workloc_count} missing values in 'WorkLoc'."
)

All missing values in 'WorkLoc' have been imputed.


In [34]:
# Distinct values found in 'CompFreq'; unique() keeps NaN as its own entry,
# so missing answers show up in the listing as well.
compfreq_categories = df['CompFreq'].unique()


In [35]:
# Header first, then one category per line (sep="\n" puts each value on its own line).
print("Categories in 'CompFreq':", *compfreq_categories, sep="\n")

Categories in 'CompFreq':
Yearly
Monthly
Weekly
nan


In [36]:
def normalize_compensation(row):
    """Convert one respondent's reported compensation to a yearly amount.

    Args:
        row: A mapping-like record (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) exposing ``'CompTotal'`` and
            ``'CompFreq'``.

    Returns:
        ``CompTotal`` scaled to a year -- unchanged for ``'Yearly'``, times 12
        for ``'Monthly'``, times 52 for ``'Weekly'`` -- or ``None`` when either
        field is missing or the frequency is not one of those three labels.
    """
    compensation = row['CompTotal']
    freq = row['CompFreq']

    # Without both an amount and a pay period there is nothing to annualize.
    if pd.isna(compensation) or pd.isna(freq):
        return None

    if freq == 'Yearly':
        return compensation
    elif freq == 'Monthly':
        return compensation * 12
    elif freq == 'Weekly':
        return compensation * 52
    else:
        return None  # Unrecognized pay period: leave the value undefined.

In [38]:
# Row-wise apply (axis=1): each row is passed to normalize_compensation and the
# returned values become the new 'NormalizedAnnualCompensation' column.
df['NormalizedAnnualCompensation'] = df.apply(normalize_compensation, axis=1)


In [39]:
# Preview the first rows of `df`, which now carries the
# 'NormalizedAnnualCompensation' column added in the previous cell.
print(df.head())


   Respondent                      MainBranch Hobbyist  \
0           4  I am a developer by profession       No   
1           9  I am a developer by profession      Yes   
2          13  I am a developer by profession      Yes   
3          16  I am a developer by profession      Yes   
4          17  I am a developer by profession      Yes   

                                         OpenSourcer  \
0                                              Never   
1                         Once a month or more often   
2  Less than once a month but more than once per ...   
3                                              Never   
4  Less than once a month but more than once per ...   

                                          OpenSource          Employment  \
0  The quality of OSS and closed source software ...  Employed full-time   
1  The quality of OSS and closed source software ...  Employed full-time   
2  OSS is, on average, of HIGHER quality than pro...  Employed full-time   
3  The qua