In [11]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

In [12]:
# Example text
text = """
Natural language processing (NLP) is a subfield of artificial intelligence (AI) 
that focuses on the interaction between computers and humans through natural language. 
It involves the development of algorithms and models to understand, interpret, and 
generate human-like language.

Tokenization is an important step in NLP. It involves breaking down text into individual words or tokens. 
After tokenization, lemmatization can be applied to reduce words to their base form. 
Stopwords, common words that don't carry much meaning, are often removed during preprocessing.

NLP techniques, such as one-hot encoding, bag-of-words, and TF-IDF, help convert text data into a format 
that can be used by machine learning models. Word embeddings, such as Word2Vec and GloVe, provide 
dense vector representations for words, capturing semantic relationships.

Understanding these techniques is crucial for working with text data and building effective NLP applications.
"""

In [13]:
# Tokenization
tokens = word_tokenize(text)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

# Remove stopwords and punctuation
stop_words = set(stopwords.words('english'))
filtered_tokens = [token.lower() for token in lemmatized_tokens if token.lower() not in stop_words and token not in punctuation]




In [14]:
# Train Word2Vec model
word2vec_model = Word2Vec([filtered_tokens], vector_size=100, window=5, min_count=1, workers=4)

# Print the word vectors
print("Word Vectors:")
for word in word2vec_model.wv.index_to_key:
    print(f"{word}: {word2vec_model.wv[word]}")


Word Vectors:
word: [-5.3863111e-04  2.4342992e-04  5.1164879e-03  9.0150675e-03
 -9.2869438e-03 -7.1573094e-03  6.4904303e-03  9.0054767e-03
 -5.0501847e-03 -3.7943488e-03  7.3691290e-03 -1.5920434e-03
 -4.5386553e-03  6.5554762e-03 -4.8641963e-03 -1.8093452e-03
  2.8603615e-03  9.7123737e-04 -8.2983226e-03 -9.4861565e-03
  7.3203840e-03  5.1041702e-03  6.7721377e-03  7.3675124e-04
  6.3398988e-03 -3.3800432e-03 -9.8302064e-04  5.7503302e-03
 -7.5171781e-03 -3.9591910e-03 -7.4910563e-03 -9.1501547e-04
  9.5536271e-03 -7.3308740e-03 -2.3547283e-03 -1.9207697e-03
  8.0926614e-03 -5.9174099e-03  2.8853961e-05 -4.8257257e-03
 -9.5912553e-03  4.9837041e-03 -8.7597836e-03 -4.3763611e-03
 -2.1486843e-05 -3.0095453e-04 -7.6872390e-03  9.5762536e-03
  4.9862294e-03  9.2434846e-03 -8.1629027e-03  4.4634617e-03
 -4.1359505e-03  7.9526554e-04  8.4806224e-03 -4.4926275e-03
  4.5343940e-03 -6.8043373e-03 -3.5785816e-03  9.3938690e-03
 -1.5590925e-03  3.3007294e-04 -4.1041197e-03 -7.6837386e-03
 -1.

# woord2 vec practical proj 


In [37]:
import pandas as pd

In [38]:
df = pd.read_csv('movie_reviews_dataset.csv')

In [39]:
df

Unnamed: 0,text,review
0,Poor acting and a confusing storyline.,0
1,A brilliant and thought-provoking piece of cin...,1
2,A forgettable experience that I wouldn't recom...,0
3,A visually stunning masterpiece that deserves ...,1
4,An emotional journey that resonated with me de...,1
...,...,...
205,I found the movie to be uplifting and inspiring.,1
206,Great acting and a compelling plot.,1
207,"The visuals were mediocre, and the cinematogra...",0
208,I was left feeling unsatisfied and disappointe...,0


In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from string import punctuation    

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler





In [41]:
# Text Preprocessing
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token.lower() for token in lemmatized_tokens if token.lower() not in stop_words and token not in punctuation]
 
    
    return filtered_tokens

# Apply text preprocessing to the 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)

In [42]:
# Train Word2Vec model
word2vec_model = Word2Vec(df['processed_text'], vector_size=100, window=5, min_count=1, workers=4)

In [43]:
# Function to calculate the average vector for a document
def document_vector(tokens, model):
    vector = [model.wv[token] for token in tokens if token in model.wv]
    return sum(vector) / len(vector) if vector else None

# Create a column with Word2Vec vectors for each document
df['word2vec_vector'] = df['processed_text'].apply(lambda x: document_vector(x, word2vec_model))


In [44]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['word2vec_vector'].dropna(), df['review'], test_size=0.2, random_state=42)



In [45]:
# Train a machine learning model (e.g., Logistic Regression)
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(list(X_train), y_train)

# Predictions on the test set
y_pred = model.predict(list(X_test))

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred) )

Accuracy: 0.42857142857142855
Classification Report:
               precision    recall  f1-score   support

           0       0.43      1.00      0.60        18
           1       0.00      0.00      0.00        24

    accuracy                           0.43        42
   macro avg       0.21      0.50      0.30        42
weighted avg       0.18      0.43      0.26        42



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
new_review = "This movie was not absolutely amazing! The acting and storyline were to bad ."
processed_new_review = preprocess_text(new_review)
new_review_vector = document_vector(processed_new_review, word2vec_model)

In [47]:
if new_review_vector is not None:
    prediction = model.predict([new_review_vector])[0]
    print("\nNew Review:")
    print(new_review)
    print("\nPredicted Sentiment:")
    if prediction == 1:
        print("Positive")
    else:
        print("Negative")
else:
    print("Words in the new review are not present in the Word2Vec model vocabulary.")


New Review:
This movie was not absolutely amazing! The acting and storyline were to bad .

Predicted Sentiment:
Negative
