In [26]:
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load the dataset
df = pd.read_csv(r'C:\Users\ki_shari\Downloads\DFF.csv', encoding='latin-1')
df.loc[df["LABEL"] == "__label1__", "LABEL"] = 1
df.loc[df["LABEL"] == "__label2__", "LABEL"] = 0
df['LABEL']=pd.to_numeric(df['LABEL'])

# Convert the 'incon' column to numerical values
incon_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df['incon_encoded'] = df['incon'].map(incon_mapping)

# Step 2: Preprocess the text data
stopwords_set = set(stopwords.words('english'))  # Set of English stopwords

preprocessed_reviews = []
for review in df['ORIGINAL_TEXT'].astype(str):
    tokens = word_tokenize(review.lower())  # Convert to lowercase and tokenize
    filtered_tokens = [token for token in tokens if token not in stopwords_set]  # Remove stopwords
    preprocessed_reviews.append(filtered_tokens)

# Step 3: Tagging and Training Doc2Vec Model
tagged_reviews = [TaggedDocument(words=review, tags=[str(i)]) for i, review in enumerate(preprocessed_reviews)]
doc2vec_model = Doc2Vec(tagged_reviews, vector_size=300, window=5, min_count=1, epochs=10)

# Step 4: Generate sentence-level embeddings
sentence_embeddings = []
for i in range(len(tagged_reviews)):
    vector = doc2vec_model.infer_vector(tagged_reviews[i].words)
    sentence_embeddings.append(vector)

# Step 5: Normalize the incon_encoded metric
scaler = MinMaxScaler()
incon_encoded = df['incon_encoded'].values.reshape(-1, 1)
incon_encoded_normalized = scaler.fit_transform(incon_encoded)

# Step 6: Concatenate embeddings with incon_encoded metric
combined_features = np.concatenate((np.array(sentence_embeddings), incon_encoded_normalized), axis=1)

In [33]:
# Assuming you have already created the combined_features matrix

# Get the shape of combined_features
combined_features_shape = combined_features.shape

# Print the shape of combined_features
print("Shape of combined_features:", combined_features_shape)

# Get the size of combined_features
combined_features_size = combined_features.size

# Print the size of combined_features
print("Size of combined_features:", combined_features_size)

Shape of combined_features: (21000, 301)
Size of combined_features: 6321000


In [37]:
specific_review_index = 10  # Specify the index of the specific review

specific_review_features = combined_features[specific_review_index]

print(f"Combined Features for Review {combined_features[specific_review_index] + 1}:")
print(specific_review_features)

Combined Features for Review [0.93263407 1.00622619 1.01456026 0.98808938 0.97094306 0.9778636
 1.02219428 1.06817468 1.04107656 0.97839567 1.02312811 0.99245256
 1.02340375 0.97771799 0.9643677  0.93320732 1.00964361 0.99075977
 1.02047344 1.03367715 1.02390415 0.97248936 1.01990118 0.98367599
 0.98808324 0.98822951 1.02247813 0.95989533 1.02912391 1.02188599
 0.99614804 1.007819   1.00603729 0.9868879  1.04010818 0.95602306
 0.92789603 0.91341232 0.99853017 1.00121182 0.99198849 0.99865434
 0.98061278 0.95022453 1.03453253 1.00866741 1.01077345 0.98851695
 0.98224237 1.02661802 1.0211389  0.96694918 0.9930325  0.9885209
 0.95199143 1.01349955 0.98980879 0.97648463 1.00189664 0.99962503
 0.95863495 1.0141521  0.96431539 1.02137691 1.04931102 1.03920306
 0.9411601  0.9895195  0.98413032 1.01848709 0.95568374 1.02379183
 1.03844363 0.98871503 1.02606482 0.97774366 1.0282464  1.00627112
 0.97875307 1.02415864 0.97478326 0.92552216 0.96986284 1.07343934
 0.94544268 0.97014385 0.96099433 1