# Environment Setup 

In [39]:
# Install the packages listed in requirements.txt.
# %pip (rather than !pip) installs into the environment the running kernel uses;
# a kernel restart may be needed before newly installed versions are importable.
%pip install -r requirements.txt

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns that mix numbers with letter codes get
# one consistent dtype and the mixed-type DtypeWarning is not emitted.
df = pd.read_csv('nyts2023.csv', low_memory=False)

Note: you may need to restart the kernel to use updated packages.


  df = pd.read_csv('nyts2023.csv')


# Exploratory Data Analysis

In [40]:
# Examine shape: (number of rows, number of columns) of the loaded DataFrame
df.shape

(22069, 1469)

In [41]:
# Print every column name as a plain Python list to inspect the DataFrame
print(df.columns.tolist())

['artificial_id', 'Non_SOGI_School', 'Location', 'QN1', 'QN2', 'QN3', 'QN4a', 'QN4b', 'QN4c', 'QN4d', 'QN4e', 'QN5a', 'QN5b', 'QN5c', 'QN5d', 'QN5e', 'QN6', 'QN7', 'QN8', 'QN9', 'QN10', 'QN11a', 'QN11b', 'QN11c', 'QN11d', 'QN11e', 'QN11f', 'QN11g', 'QN11h', 'QN11i', 'QN11j', 'QN11k', 'QN11l', 'QN11m', 'QN11n', 'QN12a', 'QN12b', 'QN12c', 'QN12d', 'QN12e', 'QN12f', 'QN12g', 'QN12h', 'QN12i', 'QN12j', 'QN12k', 'QN12l', 'QN12m', 'QN12n', 'QN13', 'QN14a', 'QN14b', 'QN14c', 'QN14d', 'QN14e', 'QN14f', 'QN14g', 'QN14h', 'QN14i', 'QN14j', 'QN14k', 'QN14l', 'QN14m', 'QN14n', 'QN14o', 'QN14p', 'QN15', 'QN16', 'QN17', 'QN18a_a', 'QN18a_b', 'QN18a_c', 'QN18a_d', 'QN18a_e', 'QN18a_f', 'QN18a_g', 'QN18a_h', 'QN18a_i', 'QN18a_j', 'QN18a_k', 'QN18b_a', 'QN18b_b', 'QN18b_c', 'QN18b_d', 'QN18b_e', 'QN18b_f', 'QN18b_g', 'QN18b_h', 'QN18b_i', 'QN18b_j', 'QN18c_a', 'QN18c_b', 'QN18c_c', 'QN18c_d', 'QN18c_e', 'QN18c_f', 'QN18c_g', 'QN18c_h', 'QN18c_i', 'QN18c_j', 'QN18d_a', 'QN18d_b', 'QN18d_c', 'QN18d_d', '

# Data Preprocessing

In [42]:
# Report how many columns the DataFrame has before any are removed
print(f'Number of columns: {df.shape[1]}')



Number of columns: 1469


In [43]:
# Drop text-valued and irrelevant columns (e.g. Location) by chaining two
# column-name filters:
#   1. '^Q'           keeps only columns whose name starts with 'Q'
#   2. '^(?!.*TEXT)'  keeps only columns whose name does not contain 'TEXT',
#                     which denotes a textual response
df = (
    df
    .filter(regex='^Q')
    .filter(regex='^(?!.*TEXT)')
)

print(f"Number of columns after removal: {df.shape[1]}")

Number of columns after removal: 1376


<u>Note</u>: Column inspection revealed that columns Q1-Q149 are raw survey choice options where some values are non-numerical,
such as N = Not Answered, Z = Not Displayed, or E = Missing. Columns QN1-QN149 are numerically encoded, with missing values encoded as NaN. Moving forward, all 'Q' columns will be removed, and only 'QN' columns will be used.

In [44]:
# Split the survey columns into two DataFrames by column-name prefix

# Columns whose name starts with 'Q' not immediately followed by 'N'
df_q = df.filter(regex='^Q(?!N)')

# Columns whose name starts with 'QN'
df_qn = df.filter(regex='^QN')

In [45]:
# Inspect 'QN' DataFrame
# Preview 10 rows, the same number previewed for the 'Q' DataFrame, so the two
# previews can be compared row for row.
df_qn.head(10)

Unnamed: 0,QN1,QN2,QN3,QN4a,QN4b,QN4c,QN4d,QN4e,QN5a,QN5b,...,QN142,QN143,QN144,QN145,QN146,QN147,QN148,QN149,QN141R,QN142R
0,5.0,2.0,2.0,1.0,,,,,,,...,,,,,,,,,,
1,8.0,2.0,1.0,,,,,,,,...,,,,,,,,,,
2,5.0,1.0,2.0,,1.0,,,,,,...,,,,,,,,,,
3,4.0,2.0,2.0,1.0,,,,,,,...,,,,,,,,,,
4,4.0,2.0,1.0,1.0,,,,,,,...,1.0,1.0,2.0,4.0,4.0,3.0,4.0,1.0,1.0,1.0
5,6.0,2.0,3.0,1.0,,,,,,,...,1.0,1.0,2.0,3.0,2.0,4.0,5.0,1.0,3.0,1.0
6,9.0,1.0,6.0,,1.0,,,,,,...,,,,,,,,,,
7,1.0,2.0,7.0,,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
8,10.0,1.0,7.0,1.0,,,,,1.0,,...,2.0,1.0,2.0,3.0,3.0,3.0,3.0,1.0,7.0,2.0
9,9.0,1.0,6.0,1.0,,,,,,,...,2.0,1.0,1.0,1.0,1.0,1.0,7.0,2.0,1.0,2.0


In [46]:
# Inspect 'Q' DataFrame
# Inspection of both DataFrames shows that the Q columns hold only raw survey choice values, whereas the QN columns are numerically encoded.
df_q.head(10)

Unnamed: 0,Q1,Q2,Q3,Q4a,Q4b,Q4c,Q4d,Q4e,Q5a,Q5b,...,Q140,Q141,Q142,Q143,Q144,Q145,Q146,Q147,Q148,Q149
0,5,2,2,1,,,,,,,...,Z,Z,Z,Z,Z,Z,Z,Z,Z,Z
1,8,2,1,N,N,N,N,N,,,...,Z,Z,Z,Z,Z,Z,Z,Z,Z,Z
2,5,1,2,,1,,,,,,...,Z,Z,Z,Z,Z,Z,Z,Z,Z,Z
3,4,2,2,1,,,,,,,...,Z,Z,Z,Z,Z,Z,Z,Z,Z,Z
4,4,2,1,1,,,,,,,...,2,1,1,1,2,4,4,3,4,1
5,6,2,3,1,,,,,,,...,2,3,1,1,2,3,2,4,5,1
6,9,1,6,,1,,,,,,...,Z,Z,Z,Z,Z,Z,Z,Z,Z,Z
7,1,2,7,,1,1,1,1,1.0,1.0,...,N,N,N,N,N,N,N,N,N,N
8,10,1,7,1,,,,,1.0,,...,2,7,2,1,2,3,3,3,3,1
9,9,1,6,1,,,,,,,...,1,1,2,1,1,1,1,1,7,2


In [47]:
# Print the name of every QN column as a plain Python list
print(df_qn.columns.tolist())

['QN1', 'QN2', 'QN3', 'QN4a', 'QN4b', 'QN4c', 'QN4d', 'QN4e', 'QN5a', 'QN5b', 'QN5c', 'QN5d', 'QN5e', 'QN6', 'QN7', 'QN8', 'QN9', 'QN10', 'QN11a', 'QN11b', 'QN11c', 'QN11d', 'QN11e', 'QN11f', 'QN11g', 'QN11h', 'QN11i', 'QN11j', 'QN11k', 'QN11l', 'QN11m', 'QN11n', 'QN12a', 'QN12b', 'QN12c', 'QN12d', 'QN12e', 'QN12f', 'QN12g', 'QN12h', 'QN12i', 'QN12j', 'QN12k', 'QN12l', 'QN12m', 'QN12n', 'QN13', 'QN14a', 'QN14b', 'QN14c', 'QN14d', 'QN14e', 'QN14f', 'QN14g', 'QN14h', 'QN14i', 'QN14j', 'QN14k', 'QN14l', 'QN14m', 'QN14n', 'QN14o', 'QN14p', 'QN15', 'QN16', 'QN17', 'QN18a_a', 'QN18a_b', 'QN18a_c', 'QN18a_d', 'QN18a_e', 'QN18a_f', 'QN18a_g', 'QN18a_h', 'QN18a_i', 'QN18a_j', 'QN18a_k', 'QN18b_a', 'QN18b_b', 'QN18b_c', 'QN18b_d', 'QN18b_e', 'QN18b_f', 'QN18b_g', 'QN18b_h', 'QN18b_i', 'QN18b_j', 'QN18c_a', 'QN18c_b', 'QN18c_c', 'QN18c_d', 'QN18c_e', 'QN18c_f', 'QN18c_g', 'QN18c_h', 'QN18c_i', 'QN18c_j', 'QN18d_a', 'QN18d_b', 'QN18d_c', 'QN18d_d', 'QN18d_e', 'QN18d_f', 'QN18d_g', 'QN18d_h', 'QN18

<u>Note</u>: Moving forward, the DataFrame `df_qn` will be saved into `df`.

In [48]:
# Update DataFrame to only have QN columns.
# Take an explicit copy so that `df` is independent of `df_qn`: later in-place
# changes to `df` (e.g. imputing missing values) will not alter `df_qn`.
df = df_qn.copy()

<u>Types of Questions that need to be dealt with</u>:  
- Ordinal Categorical Features: Ordered responses, representing levels or intensity, such as "Not at all true" to "Very true."
- Nominal (Multiclass) Categorical Features: Categories with distinct, non-ordered options, like race, ethnicity, or gender.
- Binary Categorical Features: Yes/No questions that capture simple two-option responses, e.g., "Have you ever used an e-cigarette?"
- Count Features: Numeric counts of occurrences within a timeframe, such as "How many days did you use e-cigarettes in the last 30 days?"
- Frequency-Based Categorical Features: Responses representing frequencies, often in increasing order, like "Nearly every day" to "Not at all."
- Multiple-Response Categorical Features: Options where respondents can select multiple true/false answers, such as reasons for using e-cigarettes.  
  
To solve this, a combination of methods will be used:
- Grouping or Binning Categorical Features into new columns
- One Hot Encoding
- Removing Irrelevant Questions ( Feature Selection will be last )

To inspect the questions more conveniently, a mapping of each column to its corresponding question from the codebook was created in the `map.py` file.

In [49]:
# Import the annotated column-to-question map (`questions`) from the local map.py module.
# NOTE(review): the alias `map` shadows Python's built-in map() for the rest of
# the notebook; consider a different alias if the built-in is ever needed.
from map import questions as map

Next, before training a model on the full feature set, and eventually on a reduced feature set, categorical data and missing values need to be dealt with.

In [52]:
# Preview the first 10 rows of the current DataFrame
df.head(10)

Unnamed: 0,QN1,QN2,QN3,QN4a,QN4b,QN4c,QN4d,QN4e,QN5a,QN5b,...,QN142,QN143,QN144,QN145,QN146,QN147,QN148,QN149,QN141R,QN142R
0,5.0,2.0,2.0,1.0,,,,,,,...,,,,,,,,,,
1,8.0,2.0,1.0,,,,,,,,...,,,,,,,,,,
2,5.0,1.0,2.0,,1.0,,,,,,...,,,,,,,,,,
3,4.0,2.0,2.0,1.0,,,,,,,...,,,,,,,,,,
4,4.0,2.0,1.0,1.0,,,,,,,...,1.0,1.0,2.0,4.0,4.0,3.0,4.0,1.0,1.0,1.0
5,6.0,2.0,3.0,1.0,,,,,,,...,1.0,1.0,2.0,3.0,2.0,4.0,5.0,1.0,3.0,1.0
6,9.0,1.0,6.0,,1.0,,,,,,...,,,,,,,,,,
7,1.0,2.0,7.0,,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
8,10.0,1.0,7.0,1.0,,,,,1.0,,...,2.0,1.0,2.0,3.0,3.0,3.0,3.0,1.0,7.0,2.0
9,9.0,1.0,6.0,1.0,,,,,,,...,2.0,1.0,1.0,1.0,1.0,1.0,7.0,2.0,1.0,2.0


In [54]:
# Summarise index, column dtypes and memory usage to check for categorical data,
# then report the feature (column) count
df.info()
print(f"Number of features: {df.shape[1]}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22069 entries, 0 to 22068
Columns: 689 entries, QN1 to QN142R
dtypes: float64(689)
memory usage: 116.0 MB
Number of features: 689


Based on inspection, it has been verified that all questions have a numerical data type (float), all text data was removed successfully, and all questions that have multiple choices are split into separate questions. However, questions with count / frequency data, such as "QN8: In total, on how many days have you used e-cigarettes in your entire life", need to be dealt with, as the goal is for all columns to have 0 or 1 values. Missing data also needs to be dealt with, either by dropping columns or by imputation, as the models we intend to use, such as Logistic Regression and Random Forest, cannot handle NaN values natively.

# Dealing With Missing Data

- Binary Questions (yes or no): Replace with a 0.
- Skip Logic Questions (if a previous question caused a participant to skip the current question): Replace with 0
  - Although 0 implies absence of a behavior and NaN implies "not applicable", for the purpose of this research NaN will be imputed with 0.
- Ordinal Features (how true is ___ from 1-4): Replace with median.
- Nominal / Multiclass (non-ordered questions such as gender or race): 
  - Questions such as "QN141: Which of these options best describe your sexual orientation"