In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load dataset
df = pd.read_csv('anxiety.csv', encoding='latin1')

In [3]:
# Display the first 5 rows of dataset
print(df.head())

   S. No.    Timestamp  GAD1  GAD2  GAD3  GAD4  GAD5  GAD6  GAD7  \
0       1  42052.00437     0     0     0     0     1     0     0   
1       2  42052.00680     1     2     2     2     0     1     0   
2       3  42052.03860     0     2     2     0     0     3     1   
3       4  42052.06804     0     0     0     0     0     0     0   
4       5  42052.08948     2     1     2     2     2     3     2   

                   GADE  ...  Birthplace    Residence  Reference  \
0  Not difficult at all  ...         USA          USA     Reddit   
1    Somewhat difficult  ...         USA          USA     Reddit   
2  Not difficult at all  ...     Germany      Germany     Reddit   
3  Not difficult at all  ...         USA          USA     Reddit   
4        Very difficult  ...         USA  South Korea     Reddit   

                                           Playstyle  accept GAD_T SWL_T  \
0                                       Singleplayer  Accept     1    23   
1              Multiplayer - o

In [4]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Create a new DataFrame with custom column titles
missing_df = pd.DataFrame({'Attribute': missing_values.index, 'Missing Value Count': missing_values.values})

# Display the columns with missing values (if any) and the count of missing values in each
print(missing_values[missing_values > 0])

GADE                 649
Hours                 30
League              1838
highestleague      13464
streams              100
SPIN1                124
SPIN2                154
SPIN3                140
SPIN4                159
SPIN5                166
SPIN6                156
SPIN7                138
SPIN8                144
SPIN9                158
SPIN10               160
SPIN11               187
SPIN12               168
SPIN13               187
SPIN14               156
SPIN15               147
SPIN16               147
SPIN17               175
Narcissism            23
Work                  38
Reference             15
accept               414
SPIN_T               650
Residence_ISO3       110
Birthplace_ISO3      121
dtype: int64


In [5]:
# Drop rows with missing values (NaN)
df_cleaned = df.dropna()

# Now df_cleaned contains the DataFrame with missing values dropped

In [8]:
# Check for missing values in each column
missing_values = df_cleaned.isna().sum()

# Display the count of missing values in each column
print(missing_values)


S. No.             0.0
Timestamp          0.0
GAD1               0.0
GAD2               0.0
GAD3               0.0
GAD4               0.0
GAD5               0.0
GAD6               0.0
GAD7               0.0
GADE               0.0
SWL1               0.0
SWL2               0.0
SWL3               0.0
SWL4               0.0
SWL5               0.0
Game               0.0
Platform           0.0
Hours              0.0
earnings           0.0
whyplay            0.0
League             0.0
highestleague      0.0
streams            0.0
SPIN1              0.0
SPIN2              0.0
SPIN3              0.0
SPIN4              0.0
SPIN5              0.0
SPIN6              0.0
SPIN7              0.0
SPIN8              0.0
SPIN9              0.0
SPIN10             0.0
SPIN11             0.0
SPIN12             0.0
SPIN13             0.0
SPIN14             0.0
SPIN15             0.0
SPIN16             0.0
SPIN17             0.0
Narcissism         0.0
Gender             0.0
Age                0.0
Work       

In [9]:
# Rename columns
df.rename(columns={'S. No.': 'Student.No', 'Work': 'Employment_Status', 'Degree': 'Education_Level', 'GAD_T': 'Generalized_Anxiety_Disorder_Scores', 'SWL_T': 'Satisfaction_With_Life_Scale_Scores', 'SPIN_T': 'Social_Phobia_Inventory_Scores', 'Residence_ISO3': 'Place_of_Residence', 'Birthplace_ISO3': 'Birthplace'}, inplace=True)

In [10]:
# List of columns to drop
columns_to_drop = ['earnings', 'League', 'highestleague', 'streams', 'Reference', 'accept']

# Drop the specified columns
df.drop(columns=columns_to_drop, inplace=True)

In [11]:
attributes = df.columns
print(attributes)

Index(['Student.No', 'Timestamp', 'GAD1', 'GAD2', 'GAD3', 'GAD4', 'GAD5',
       'GAD6', 'GAD7', 'GADE', 'SWL1', 'SWL2', 'SWL3', 'SWL4', 'SWL5', 'Game',
       'Platform', 'Hours', 'whyplay', 'SPIN1', 'SPIN2', 'SPIN3', 'SPIN4',
       'SPIN5', 'SPIN6', 'SPIN7', 'SPIN8', 'SPIN9', 'SPIN10', 'SPIN11',
       'SPIN12', 'SPIN13', 'SPIN14', 'SPIN15', 'SPIN16', 'SPIN17',
       'Narcissism', 'Gender', 'Age', 'Employment_Status', 'Education_Level',
       'Birthplace', 'Residence', 'Playstyle',
       'Generalized_Anxiety_Disorder_Scores',
       'Satisfaction_With_Life_Scale_Scores', 'Social_Phobia_Inventory_Scores',
       'Place_of_Residence', 'Birthplace'],
      dtype='object')


In [12]:
# EDA (Exploratory Data Analysis)
# Summary Statistics
summary_stats = df.describe

# Display the summary statistics
print(summary_stats)

<bound method NDFrame.describe of        Student.No    Timestamp  GAD1  GAD2  GAD3  GAD4  GAD5  GAD6  GAD7  \
0               1  42052.00437     0     0     0     0     1     0     0   
1               2  42052.00680     1     2     2     2     0     1     0   
2               3  42052.03860     0     2     2     0     0     3     1   
3               4  42052.06804     0     0     0     0     0     0     0   
4               5  42052.08948     2     1     2     2     2     3     2   
...           ...          ...   ...   ...   ...   ...   ...   ...   ...   
13459       14246  42057.75678     1     0     0     1     0     1     1   
13460       14247  42057.81185     3     3     3     3     2     3     3   
13461       14248  42058.16964     0     0     0     0     0     0     0   
13462       14249  42058.24420     3     2     1     3     0     1     3   
13463       14250  42058.36375     1     1     0     0     0     0     0   

                       GADE  ...                Emplo

In [13]:
# Assuming 'cleaned_data.csv' is the desired file name for the exported data
df.to_excel('latest-cleaned-anxiety-data.xlsx', index=False)