In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Survey question example for qualitative analysis.

"What are your opinions on the health impacts of traditional smoking versus vaping?"

In [None]:
!pip install nltk
!pip install openai pandas



In [None]:
!pip install gensim
from gensim.models import KeyedVectors



In [None]:
import re
import nltk
import spacy
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')
nltk.download('punkt')

# load spacy pre traning model
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    text = text.lower()  # transform to lower letter
    text = re.sub(r'\d+', '', text)  # delete number
    text = re.sub(r'[^\w\s]', '', text)  # delete punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # delete extra parathesis
    return text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


I generate the train data and label it into four categories:

0: Negative effects of traditional smoking
1: Negative effects of e-cigarettes
2: Positive effects of e-cigarettes
3: Non-committal (or don't know)

In [None]:
import pandas as pd

# text and its corresponding label
data = [
    ["Traditional smoking is very harmful to health.", 0],
    ["Vaping is considered less harmful than traditional smoking.", 2],
    ["I think vaping is still bad for health.", 1],
    ["Smoking can cause lung cancer and other serious diseases.", 0],
    ["Some believe vaping is a safer alternative to smoking.", 2],
    ["Both smoking and vaping are harmful in different ways.", 3],
    ["I find that vaping helps me reduce smoking.", 2],
    ["Traditional smoking is addictive and very dangerous.", 0],
    ["Vaping flavors make it more appealing, but it's not risk-free.", 1],
    ["I quit smoking thanks to vaping, and I feel healthier.", 2],
    ["Smoking has a long history of health risks.", 0],
    ["The long-term health impacts of vaping are still unknown.", 1],
    ["Vaping has helped many people quit smoking, but it's not entirely safe.", 2],
    ["Smoking damages your lungs severely.", 0],
    ["Vaping is seen as a lesser evil compared to smoking.", 2],
    ["There is a lot of debate about vaping being safer than smoking.", 3],
    ["I believe vaping is a good way to quit traditional smoking.", 2],
    ["Smoking is more harmful than vaping according to some studies.", 1],
    ["Vaping can still lead to nicotine addiction.", 1],
    ["Many people use vaping to help them stop smoking.", 2],
    ["Smoking in public places should be banned.", 0],
    ["Vaping might cause health problems that we don't know about yet.", 1],
    ["Some people think vaping is a gateway to smoking.", 3],
    ["There is no safe level of smoking.", 0],
    ["Electronic cigarettes are not risk-free.", 1],
    ["I find it hard to quit smoking, even with vaping.", 0],
    ["Vaping can be an effective tool to quit smoking.", 2],
    ["The impact of vaping on youth is concerning.", 1],
    ["Vaping should be regulated like traditional cigarettes.", 3],
    ["Some studies suggest vaping is less harmful, but more research is needed.", 3],
    ["Smoking while pregnant can harm the baby.", 0],
    ["Vaping devices sometimes explode, causing injuries.", 1],
    ["Second-hand smoke from traditional cigarettes is dangerous.", 0],
    ["Vaping can lead to dry mouth and throat irritation.", 1],
    ["Quitting smoking improves lung function.", 2],
    ["Vaping in enclosed spaces should be restricted.", 3],
    ["Smoking causes yellow teeth and bad breath.", 0],
    ["Vaping has not been proven to be effective in the long term.", 1],
    ["Parents should discourage their children from vaping.", 1],
    ["Smoking-related diseases are a major public health concern.", 0],
    ["The flavors in vaping products attract young people.", 1],
    ["Vaping is marketed as a healthier alternative, but we need more evidence.", 3],
    ["Traditional cigarettes have over 7000 chemicals.", 0],
    ["Vaping may help smokers quit, but it's not a perfect solution.", 2],
    ["Smoking is a leading cause of preventable death worldwide.", 0],
    ["There is a lack of regulation on vaping products.", 1],
    ["Nicotine in vaping products is addictive.", 1],
    ["Some believe vaping can cause popcorn lung.", 1],
    ["Vaping can lead to nicotine addiction.", 1],
    ["Smoking causes various types of cancer.", 0],
    ["Vaping is less harmful than smoking, but still risky.", 3],
    ["Many studies show the dangers of smoking.", 0],
    ["Vaping has its own set of health risks.", 1],
    ["Smoking affects not only the smoker but also those around them.", 0],
    ["Vaping should not be considered completely safe.", 1],
    ["Smoking is a difficult habit to break.", 0],
    ["Vaping has helped some quit smoking, though not all.", 2],
    ["Traditional smoking has numerous harmful effects.", 0],
    ["The debate over vaping's safety continues.", 3],
    ["Vaping could potentially be a less harmful option.", 2],
    ["Smoking remains a significant health risk.", 0],
    ["Vaping is not a foolproof method for quitting smoking.", 1],
    ["Traditional cigarettes are highly addictive.", 0],
    ["Vaping might reduce smoking, but with unknown long-term effects.", 3],
    ["Smoking's impact on health is well-documented.", 0],
    ["Vaping's appeal to young people is concerning.", 1],
    ["There is still much to learn about the long-term effects of vaping.", 1],
    ["Smoking has immediate and long-term health consequences.", 0],
    ["Vaping can be a stepping stone to quitting smoking.", 2],
    ["Second-hand vapor from e-cigarettes is also a concern.", 1],
    ["Many view vaping as a lesser evil compared to smoking.", 3],
    ["The risks associated with vaping are still being studied.", 1],
    ["Smoking has been linked to heart disease.", 0],
    ["Vaping's safety profile is not fully understood.", 1],
    ["Traditional smoking can lead to chronic diseases.", 0],
    ["Some people successfully quit smoking using vaping.", 2],
    ["There is a push for stricter regulations on vaping.", 3],
    ["Smoking is harmful in any amount.", 0],
    ["Vaping products vary in quality and safety.", 1],
    ["Nicotine addiction is a common issue with vaping.", 1],
    ["Smoking has a significant negative impact on health.", 0],
    ["Many believe vaping is a safer alternative to smoking.", 2],
    ["The effects of vaping on health need more research.", 1],
    ["Smoking cessation programs often include vaping as an option.", 2],
    ["The harmful effects of smoking are widely known.", 0],
    ["Vaping is not without its risks.", 1],
    ["Smoking can shorten your lifespan.", 0],
    ["Vaping has been beneficial for some smokers.", 2],
    ["There are ongoing studies about vaping's long-term impact.", 3],
    ["Traditional smoking leads to severe health issues.", 0],
    ["Vaping has its share of health concerns.", 1],
    ["The benefits of vaping for quitting smoking are debated.", 3],
    ["Smoking is a leading cause of cancer.", 0],
    ["Vaping may help with smoking cessation.", 2],
    ["The public health impact of vaping is still being assessed.", 1],
    ["Smoking rates have declined with the rise of vaping.", 2],
    ["Vaping is not the perfect solution, but it helps some people quit smoking.", 2],
    ["Vaping is better than smoking", 2],
    ["Smoking has been linked to multiple forms of cancer.", 0],
    ["Traditional cigarettes contain harmful toxins that damage your lungs.",0],
    ["The habit of smoking can lead to severe respiratory issues.",0],
    ["Cigarette smoke contributes to air pollution and harms the environment.",0],
    ["Smoking significantly increases the risk of heart disease and stroke.",0],
    ["Switching to vaping has helped many smokers reduce their tobacco intake.",2],
["E-cigarettes are often seen as a less harmful alternative to smoking.",2],
["Vaping can be an effective tool for those looking to quit smoking.",2],
["Some studies suggest that vaping is less harmful to bystanders compared to smoking.",2],
["Using e-cigarettes has allowed me to quit smoking entirely.",2],
["There are still many debates on whether vaping is safer than smoking.",3],
["It's hard to say if vaping is truly better than traditional smoking.",3],
["Some people believe vaping is just a trend, while others think it has long-term risks.",3],
["The safety of vaping is still being studied, and conclusions vary.",3],
["Opinions are divided on whether e-cigarettes are a good alternative to smoking.",3],
["Vaping can lead to nicotine addiction just like traditional smoking.",1],
["The chemicals in e-cigarettes may cause lung damage over time.",1],
["E-cigarettes have been found to contain toxic substances harmful to health.",1],
["Vaping might cause long-term health issues that are still unknown.",1],
["Some users report throat irritation and coughing after vaping.",1],
]
new_entries = [
    ["Traditional smoking has been linked to various life-threatening diseases such as lung cancer and heart disease.", 0],
    ["Vaping might be less harmful than smoking, but it still contains chemicals that could pose health risks.", 1],
    ["Switching to vaping helped me cut down on traditional smoking, and I feel healthier now.", 2],
    ["There’s still a lot of uncertainty around the long-term health effects of vaping.", 3],
    ["Smoking is known to cause chronic respiratory problems, making it extremely dangerous.", 0],
    ["Vaping could still expose users to harmful substances, even though it’s often advertised as safer.", 1],
    ["Using vaping as a smoking cessation tool has worked for many people, including myself.", 2],
    ["I’m not sure if vaping is truly safer than smoking; more research is needed.", 3],
    ["Second-hand smoke from traditional cigarettes is a serious concern for public health.", 0],
    ["Vaping may contain fewer toxins than smoking, but it’s not without its dangers.", 1],
    ["I switched to vaping to quit smoking and have had fewer cravings since then.", 2],
    ["The scientific community is still divided on whether vaping is a safe alternative to smoking.", 3],
    ["Traditional smoking has been a major contributor to lung diseases for decades.", 0],
    ["Although vaping is marketed as safer, it can still lead to nicotine addiction.", 1],
    ["Vaping helped me quit smoking completely, and I believe it’s a better alternative.", 2],
    ["The debate on vaping versus smoking continues, and I’m not sure which is worse.", 3],
    ["Smoking negatively impacts both the smoker’s health and those exposed to second-hand smoke.", 0],
    ["Vaping liquids contain chemicals that might have adverse health effects.", 1],
    ["Many people have successfully used vaping to quit traditional smoking.", 2],
    ["It’s hard to tell if vaping is a real solution or just another health risk.", 3]
]
data.extend(new_entries)
df = pd.DataFrame(data, columns=['text', 'label'])
generated_texts = [item[0] for item in data]
generated_labels = [item[1] for item in data]

df.to_csv('train_data_qualitative_2024.csv', index=False)

The word_tokenize method splits the input text string into a list of words, including punctuation marks. It splits according to Spaces, punctuation, etc., while keeping punctuation as separate units. For example, "Traditional smoking is very harmful to health." will be broken down into ['Traditional', 'smoking', 'is', 'very', 'harmful', 'to', 'health', '.'].

Stop words are common words that are usually ignored in text processing, such as "is", "and", "the", etc.

In [None]:
cleaned_texts = [clean_text(text) for text in generated_texts]
def tokenize_and_remove_stopwords(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words
tokenized_texts = [tokenize_and_remove_stopwords(text) for text in cleaned_texts]

Because in word_tokenize, not like will be splited into ['not','like']. We need to combine them into one single word ['not_like'] to represent the negate meanning.

In [None]:
# handle the negate words
def handle_negations(text_tokens):
    result = []
    negate = False
    for token in text_tokens:
        if token.lower() in ["not", "no", "never"]:
            negate = True
        elif negate:
            result.append(f"not_{token}")
            negate = False
        else:
            result.append(token)
    return result

negation_handled_texts = [handle_negations(text) for text in tokenized_texts]

# recombine the filter texts
final_texts = [' '.join(text) for text in negation_handled_texts]

GloVe (Global Vectors for Word Representation) is a pre-trained word embedding model that maps words to high-dimensional vector Spaces. The vector representation of each word captures its semantic and contextual information. This vector representation can be used to compare similarities between words.

The GloVe vector for each word captures its semantic information. By calculating the average of all the word vectors in the text, we can get a vector that represents the semantics of the entire text

In [None]:
# Load GloVe word embedding model
glove_file = '/content/drive/MyDrive/glove.6B.100d.txt'
glove_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Calculate the word embedding average for each text
def get_average_word2vec(tokens_list, vector, k=100):
    if len(tokens_list) < 1:
        return np.zeros(k)
    vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    mean = np.mean(vectorized, axis=0)
    return mean

X = np.array([get_average_word2vec(tokens, glove_vectors) for tokens in final_texts])

Logistic regression: it estimates the probability of an event occurring, and it is used in the classification.

In [None]:
# split the data set
X_train, X_test, y_train, y_test = train_test_split(X, generated_labels, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Accuracy: 0.6428571428571429


K means: it is the process of teaching a computer to use unlabeled, unclassified data and enabling the algorithm to operate on that data without supervision.

In [None]:
from sklearn.cluster import KMeans
from scipy.stats import mode
# Kmeans
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(X_train)

kmeans_y_pred = kmeans.predict(X_test)
y_test = np.array(y_test)
# align the label
labels_aligned = np.zeros_like(kmeans_y_pred)
for i in range(4):
    mask = (kmeans_y_pred == i)
    labels_aligned[mask] = mode(y_test[mask])[0]

print("KMeans Accuracy:", accuracy_score(y_test, labels_aligned))


KMeans Accuracy: 0.42857142857142855


Use the new texts to test our logistic regression model because it performe better than Kmeans and store it into excel.

In [None]:
from operator import ne
new_texts = [
    "Smoking is terrible for your health.",
    "Vaping do not make me better",
    "Traditional smoking is known to cause various health issues.",
    "There is a lot of debate about vaping being safer than smoking.",
    "I already addicted in vaping",
    "Vaping has helped me quit smoking."
]
new_cleaned_texts = [clean_text(text) for text in new_texts]
new_tokenized_texts = [tokenize_and_remove_stopwords(text) for text in new_cleaned_texts]
new_negation_handled_texts = [handle_negations(' '.join(text)) for text in new_tokenized_texts]
new_final_texts = [' '.join(text) for text in new_negation_handled_texts]

new_X = np.array([get_average_word2vec(word_tokenize(text), glove_vectors) for text in new_final_texts])

new_y_pred = model.predict(new_X)

print("Predictions:", new_y_pred)

Predictions: [0 1 0 0 1 2]


In [None]:
df = pd.DataFrame({'Text': new_texts, 'Predicted Label': new_y_pred})
df.to_csv('predictions.csv', index=False)

Create the web for user to input the response to interact my model. The web link is https://03b022a2d9b41bcbee.gradio.live, and it expires in 72 hour after each time execution.



In [None]:
!pip install gradio



In [None]:
!pip install gradio huggingface_hub




In [None]:
import gradio as gr
def process_input(user_input):
    cleaned_text = clean_text(user_input)
    tokenized_text = tokenize_and_remove_stopwords(cleaned_text)
    negation_handled_text = handle_negations(' '.join(tokenized_text))
    final_text = ' '.join(negation_handled_text)

    X = np.array([get_average_word2vec(word_tokenize(final_text), glove_vectors)])
    y_pred = model.predict(X)
    s=' which means '
    if y_pred == 0:
      s += 'Negative effects of traditional smoking versus vaping'
    elif y_pred == 1:
      s += 'Negative effects of e-cigarettes'
    elif y_pred == 2:
      s += 'Positive effects of e-cigarettes'
    elif y_pred == 3:
      s += 'Non-committal (or don\'t know)'
    return f"Predictions: {y_pred}", s

title = "What is your opinion on the health impacts of traditional smoking versus vaping?"
description = "Enter your opinion below and the model will predict the category of the statement."

iface = gr.Interface(
    fn=process_input,
    inputs="text",
    outputs="text",
    title=title,
    description=description,
    theme="default",
    examples=["Smoking is terrible for your health.", "Vaping do not make me better", "Vaping has helped me quit smoking."]
)
iface.launch(share=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a4cc4bdb4e818362ad.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


