# Dimensionality Reduction
### on text data

Dimensionality Reduction, in Python with Scikit-learn

The dataset is part of the [Youth Mental Health Narratives](https://www.drivendata.org/competitions/group/cdc-narratives/) project.

<br>
@Ricardo Almeida

In [None]:
!pip install nltk

In [None]:
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import numpy as np
import pandas as pd
import re
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import warnings

# Register a global "ignore" filter for every FutureWarning
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
# Fetch the NLTK resources needed by the preprocessing step (stop-word list and
# WordNet data). quiet=True is passed to each call; the return value is discarded.
for _resource in ('stopwords', 'wordnet'):
    _ = nltk.download(_resource, quiet=True)

In [None]:
# Name of the dataset column that holds the free-text narrative
text_column: str = "NarrativeLE"

# Fixed seed; not referenced by the cells shown here — presumably meant to be
# passed as `random_state` to the stochastic steps so results are reproducible.
RANDOM_SEED: int = 87_987

### Loading dataset

Youth Mental Health Narratives dataset

In [None]:
# Read the CSV into a DataFrame (path is relative to the working directory)
csv_path = "data/features_Z140Hep.csv"
df = pd.read_csv(csv_path)

In [None]:
# Inspect the data as loaded (the notebook renders the cell's last expression)
df

In [None]:
# Keep every row but only the narrative column (result is still a DataFrame)
df = df.loc[:, [text_column]]

In [None]:
# Check that only the narrative column remains
df

### Pre-processing text

In [None]:
# Work on an explicit copy so that the column assignments in the following cells
# act on an independent DataFrame (presumably to avoid pandas'
# SettingWithCopyWarning after the column selection above).
df = df.copy()

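The following transformations are applied (not listed in execution order — lowercasing runs first):

- Punctuation removal: stripping punctuation marks from the text
- Blacklist removal: removing specific words
- Tokenization: splitting the text into individual words (tokens)
- Lemmatization: reducing words to their base, dictionary form
- Lowercasing: converting all text to lowercase
- Stop-word and number removal: dropping common English stop words and purely numeric tokens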

In [None]:
# Lowercase the narratives before tokenizing: tokens are later compared verbatim
# against the (lowercase) blacklist entries and the stop-word list.
df[text_column] = df[text_column].str.lower()

In [None]:
# Tokens dropped during preprocessing regardless of context
# (presumably placeholder / redaction markers used in the narratives — confirm).
word_blacklist = [
    "v",
    "v1",
    "v2",
    "xxxx",
    "xx",
]

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    """Build the tokenizer, lemmatizer and stop-word set once and reuse them.

    Creating these for every narrative is wasteful (the stop-word list is
    re-read on each call), so the first call caches them for all later calls.
    """
    return (
        RegexpTokenizer(r'\w+'),
        WordNetLemmatizer(),
        frozenset(stopwords.words('english')),
    )


# pre-process text
def process_text(text):
    """Turn one narrative into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. split the text into ``\\w+`` runs (this discards punctuation);
      2. drop English stop words and purely numeric tokens;
      3. lemmatize the remaining tokens;
      4. drop lemmas listed in ``word_blacklist`` and lemmas whose first
         character is not alphabetic.

    Matching against the stop-word list and the blacklist is exact, so the
    text is expected to be lowercased by the caller.

    Args:
        text: the narrative to process. Anything that is not a ``str``
            (e.g. the NaN pandas uses for a missing value) is treated as an
            empty narrative instead of raising ``TypeError``.

    Returns:
        list[str]: the processed tokens, in their original order.
    """
    # Missing narratives arrive as NaN/None; tokenizing them would raise.
    if not isinstance(text, str):
        return []

    tokenizer, lemmatizer, stop_words_set = _text_resources()
    blacklist = frozenset(word_blacklist)

    processed_tokens = []
    for token in tokenizer.tokenize(text):
        # Stop words and numbers are filtered on the raw token, before lemmatizing
        if token in stop_words_set or token.isnumeric():
            continue

        # Reduce the word to its base or dictionary form
        lemma = lemmatizer.lemmatize(token)

        # Blacklist and "starts with a letter" checks apply to the lemma;
        # slicing keeps the check safe even for an empty string.
        if lemma in blacklist or not lemma[:1].isalpha():
            continue

        processed_tokens.append(lemma)

    return processed_tokens

In [None]:
# Run the preprocessing on every narrative; each row of the new 'tokens'
# column holds the list returned by process_text for that row.
df['tokens'] = df[text_column].map(process_text)

### One-hot encoding

In [None]:
# Join each token list into one space-separated string, the input format
# expected by the vectorizer
df['token_str'] = df['tokens'].map(' '.join)

# One-hot encoding: binary=True records whether a term occurs in a document
# rather than how many times it occurs
vectorizer = CountVectorizer(binary=True)
one_hot_encoded = vectorizer.fit_transform(df['token_str'])

# Densify the matrix and label each column with its vocabulary term
vocabulary = vectorizer.get_feature_names_out()
one_df = pd.DataFrame(data=one_hot_encoded.toarray(), columns=vocabulary)

In [None]:
# Preview the first 10 rows of the encoded matrix (one column per vocabulary term)
one_df.head(10)

In [None]:
# The number of columns of the encoded matrix is the size of the vocabulary
vocabulary_size = len(one_df.columns)
print(f"Dimensionality\nNr. words = {vocabulary_size}")

### Dimensionality reduction 

#### Task #1

*Task*: Apply t-SNE to reduce the data in `one_df` to **2 dimensions** for visualization.

In [None]:
# Apply t-SNE with 2 components
# NOTE: the `...` (Ellipsis) values below are exercise placeholders and must be
# replaced before the following cells can run.

# define the t-SNE model
# Hint: `TSNE` is already imported from sklearn.manifold; request 2 components and
# consider passing `random_state=RANDOM_SEED` so the embedding is reproducible.
tsne = ...

# perform the transformation on 'one_df' dataframe
# Hint: the result needs exactly 2 columns, because the next cell wraps it in a
# DataFrame with the columns 'Dimension 1' and 'Dimension 2'.
reduced_data = ...

In [None]:
# Wrap the 2-D embedding in a labelled DataFrame for plotting
axis_labels = ['Dimension 1', 'Dimension 2']
reduced_df = pd.DataFrame(data=reduced_data, columns=axis_labels)

In [None]:
# Scatter plot of the documents projected into the 2D t-SNE space,
# drawn through an explicit Axes object
figure, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=reduced_df,
    x='Dimension 1',
    y='Dimension 2',
    alpha=0.7,
    s=10,
    ax=ax,
)
ax.set_title("t-SNE Visualization of One-Hot Encoded Word Occurrences", fontsize=14)
ax.set_xlabel("Dimension 1", fontsize=12)
ax.set_ylabel("Dimension 2", fontsize=12)
ax.grid(True)
plt.show()

#### Task #2

*Task*: Apply t-SNE to reduce the data in `one_df` to **3 dimensions** for visualization.

In [None]:
# Apply t-SNE with 3 components
# NOTE: the `...` (Ellipsis) values below are exercise placeholders and must be
# replaced before the following cells can run.

# define the t-SNE model
# Hint: `TSNE` is already imported from sklearn.manifold; request 3 components and
# consider passing `random_state=RANDOM_SEED` so the embedding is reproducible.
tsne = ...

# perform the transformation on 'one_df' dataframe
# Hint: the result needs exactly 3 columns, because the next cell wraps it in a
# DataFrame with the columns 'Dimension 1', 'Dimension 2' and 'Dimension 3'.
reduced_data = ...

In [None]:
# Create a DataFrame for visualization
reduced_df = pd.DataFrame(reduced_data, columns=['Dimension 1', 'Dimension 2', 'Dimension 3'])

# Include the original words.
# The rows of `reduced_data` follow the row order of `df`, so the tokens are
# attached by position (`.to_numpy()`) rather than through pandas index
# alignment, which would silently yield NaN / mismatched rows if `df` ever
# carried a non-default index (e.g. after filtering rows).
reduced_df['words'] = df['tokens'].to_numpy()

In [None]:
# Interactive 3D visualization, using Plotly
import plotly.express as px

# One marker per row of reduced_df; hovering a marker shows its 'words' entry
fig = px.scatter_3d(
    data_frame=reduced_df,
    x='Dimension 1',
    y='Dimension 2',
    z='Dimension 3',
    hover_name='words',
    title="Interactive 3D t-SNE Visualization",
    opacity=0.7,
)

# Fixed canvas size (800 x 600) and a larger title font
fig.update_layout(width=800, height=600, title_font={'size': 20})

# Show the plot
fig.show()