In [1]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
import stanza
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict


In [2]:
# Load the Sentiment140 dataset (replace the path with your actual file path)
df = pd.read_csv('sentiment140.csv', encoding='ISO-8859-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Preview the dataset
df.head()


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
import re
from nltk.tokenize import word_tokenize

# Basic preprocessing function
def preprocess(text):
    # Remove links, user handles, and non-alphabetic characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()
    return text

# Apply preprocessing
df['cleaned_text'] = df['text'].apply(preprocess)

# Tokenize text for word2vec input
df['tokens'] = df['cleaned_text'].apply(word_tokenize)
df[['cleaned_text', 'tokens']].head()


Unnamed: 0,cleaned_text,tokens
0,a thats a bummer you shoulda got david car...,"[a, thats, a, bummer, you, shoulda, got, david..."
1,is upset that he cant update his facebook by t...,"[is, upset, that, he, cant, update, his, faceb..."
2,i dived many times for the ball managed to sa...,"[i, dived, many, times, for, the, ball, manage..."
3,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,no its not behaving at all im mad why am i he...,"[no, its, not, behaving, at, all, im, mad, why..."


In [4]:
# Train Word2Vec model
sentences = df['tokens'].values.tolist()  # List of tokenized sentences
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the model for later use
model.save("word2vec_sentiment140.model")

# Check the vector for a specific word
model.wv['happy']


array([ 1.1380569e+00,  1.6798391e+00,  5.3490782e-01, -1.3220973e+00,
        1.5473249e+00,  2.8470736e+00, -1.0451984e+00,  5.1259956e+00,
       -8.8799024e-01,  1.8172508e+00, -1.3332095e+00,  5.9322685e-01,
       -9.4213879e-01,  1.8080770e+00, -1.2684439e+00,  9.3826538e-01,
        1.5614133e+00,  1.1706752e+00, -4.5590761e-01,  1.4284602e+00,
        2.9882237e-01,  2.5660779e+00, -1.4944427e+00,  6.7218417e-01,
       -6.2280458e-01,  3.5854176e-01, -1.0778033e+00, -5.8193398e-01,
       -4.7605243e+00, -2.0378101e+00,  7.4142665e-01,  1.2382221e+00,
        2.6300892e-01,  1.7938393e+00,  4.2192297e+00, -8.1224136e-02,
        8.7886113e-01,  2.3239298e+00, -2.8447332e+00,  8.4191501e-01,
        1.1227735e+00, -1.0532027e-01, -1.8163227e+00, -6.3161486e-01,
       -3.2177576e-01, -6.6026938e-01, -8.0751383e-01,  4.4102979e+00,
       -1.9376391e+00,  1.1603693e-02,  1.4293714e+00,  3.2226915e+00,
       -1.5676018e+00, -1.8569261e-01, -2.4214728e+00,  2.7329717e+00,
      

In [5]:
def sentence_embedding(tokens, model):
    # Create a vector by averaging word vectors for each token in the sentence
    vector = np.zeros(100)  # Word2Vec vector size is 100
    count = 0
    for word in tokens:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    if count > 0:
        vector /= count  # Normalize the vector by dividing by the number of words
    return vector

# Apply sentence embedding to all sentences in the dataset
df['sentence_embedding'] = df['tokens'].apply(lambda tokens: sentence_embedding(tokens, model))

# Check the first sentence embedding
df['sentence_embedding'].head()


0    [-0.04507019370794296, 0.303500477806665, 0.48...
1    [-0.14101133531048185, 0.7837200789224534, 0.1...
2    [0.17004649911541492, -0.8581965663870506, -0....
3    [-0.6621622829232365, -0.3362919680774212, -0....
4    [-0.10064557606820017, 0.15758222572039812, 0....
Name: sentence_embedding, dtype: object

In [None]:

# Download and initialize Stanza for dependency parsing
stanza.download('en')  # Download English model for Stanza
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse')

# Function to parse the sentence and create a dependency graph
def parse_dependencies(sentence):
    doc = nlp(sentence)
    dep_graph = defaultdict(list)
    for sent in doc.sentences:
        for word in sent.words:
            dep_graph[word.id].append((word.text, word.head))
    return dep_graph

# Apply the dependency parsing to the cleaned text
df['dependencies'] = df['cleaned_text'].apply(parse_dependencies)

# Visualize the dependencies of a sentence
def visualize_dependency_graph(dep_graph):
    G = nx.DiGraph()
    for word_id, (word, head) in dep_graph.items():
        G.add_edge(head, word_id, label=word)
    
    # Plot the graph
    nx.draw(G, with_labels=True, font_weight='bold', node_color='skyblue', font_size=10, node_size=2000)
    plt.show()

# Example for the first sentence
import networkx as nx
visualize_dependency_graph(df['dependencies'][0])


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-15 15:21:18 INFO: Downloaded file to C:\Users\PC\stanza_resources\resources.json
2024-11-15 15:21:18 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.9.0/models/default.zip:   0%|          | 0…

2024-11-15 15:26:32 INFO: Downloaded file to C:\Users\PC\stanza_resources\en\default.zip
2024-11-15 15:26:37 INFO: Finished downloading models and saved to C:\Users\PC\stanza_resources
2024-11-15 15:26:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-15 15:26:38 INFO: Downloaded file to C:\Users\PC\stanza_resources\resources.json
2024-11-15 15:26:38 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2024-11-15 15:26:38 INFO: Using device: cpu
2024-11-15 15:26:38 INFO: Loading: tokenize
2024-11-15 15:26:38 INFO: Loading: mwt
2024-11-15 15:26:38 INFO: Loading: pos
2024-11-15 15:26:39 INFO: Loading: lemma
2024-11-15 15:26:39 INFO: Loading: depparse
2024-11-15 15:26:39 INFO: Done loading processors!
