In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import warnings
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import gensim
from gensim.models import Word2Vec

In [2]:
# Suppress warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Load car data
train_raw_df = pd.read_csv(r"C:\Users\hrish\OneDrive\Desktop\cardataset.csv")

In [4]:
train_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

### Preprocessing

In [5]:
# Drop null values and create a new column combining multiple columns
train_raw_df.dropna(inplace=True)
train_raw_df.reset_index(drop=True, inplace=True)
train_raw_df["train_text"] = train_raw_df[['Market Category', 'Vehicle Size', 'Vehicle Style']].apply(' '.join, axis=1)

In [6]:
x_train = train_raw_df["train_text"]
y_train = train_raw_df.MSRP

In [7]:
# Remove stopwords, and perform stemming
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [8]:
doc = " ".join(x_train)

In [9]:
count_vec = CountVectorizer()
count_occurs = count_vec.fit_transform([doc])
count_occur_df = pd.DataFrame((count, word) for word, count in zip(count_occurs.toarray().tolist()[0], count_vec.get_feature_names_out()))
count_occur_df.columns = ['Word', 'Count']
count_occur_df.sort_values('Count', ascending=False, inplace=True)
count_occur_df.head()

Unnamed: 0,Word,Count
23,performance,3456
19,luxury,3279
20,midsize,3187
4,compact,3039
1,4dr,2771


In [10]:
norm_count_vec = TfidfVectorizer(use_idf=False, norm='l2')
norm_count_occurs = norm_count_vec.fit_transform([doc])
norm_count_occur_df = pd.DataFrame((count, word) for word, count in zip(
    norm_count_occurs.toarray().tolist()[0], norm_count_vec.get_feature_names_out()))
norm_count_occur_df.columns = ['Word', 'Count']
norm_count_occur_df.sort_values('Count', ascending=False, inplace=True)
norm_count_occur_df.head()

Unnamed: 0,Word,Count
23,performance,0.38667
19,luxury,0.366867
20,midsize,0.356573
4,compact,0.340015
1,4dr,0.31003


In [11]:
# Bag-of-Words (BoW) approach
count_vec = CountVectorizer()
count_occurs = count_vec.fit_transform(train_raw_df['train_text'])
bow_feature_names = count_vec.get_feature_names_out()

In [12]:
tfidf_vec = TfidfVectorizer()
tfidf_count_occurs = tfidf_vec.fit_transform([doc])
tfidf_count_occur_df = pd.DataFrame((count, word) for word, count in zip(
    tfidf_count_occurs.toarray().tolist()[0], tfidf_vec.get_feature_names_out()))
tfidf_count_occur_df.columns = ['Word', 'Count']
tfidf_count_occur_df.sort_values('Count', ascending=False, inplace=True)
tfidf_count_occur_df.head()

Unnamed: 0,Word,Count
23,performance,0.38667
19,luxury,0.366867
20,midsize,0.356573
4,compact,0.340015
1,4dr,0.31003


### Word2Vec Embeddings

In [13]:
# Tokenize the text data
data = []
for text in train_raw_df["train_text"]:
    tokens = word_tokenize(text.lower())
    data.append(tokens)

In [14]:
# Create Word2Vec model
word2vec_model = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)

In [15]:
vocab = word2vec_model.wv.index_to_key
print(vocab)

[',', 'luxury', 'midsize', 'compact', '4dr', 'hatchback', 'suv', 'performance', 'crossover', 'sedan', 'large', 'high-performance', 'flex', 'fuel', 'coupe', 'cab', 'pickup', 'convertible', 'tuner', 'factory', '2dr', 'exotic', 'wagon', 'crew', 'hybrid', 'diesel', 'extended', 'regular', 'passenger', 'van', 'minivan', 'cargo']


In [16]:
# Example of accessing Word2Vec embeddings

word_embedding_hatchback = word2vec_model.wv['luxury']
word_embedding_sedan = word2vec_model.wv['sedan']

print("Word Embedding for 'sedan':", word_embedding_sedan)
print("Word Embedding for 'hatchback':", word_embedding_hatchback)


Word Embedding for 'sedan': [ 0.22187115  0.16104817  0.05745517  0.20001909  0.0397713  -0.20779292
  0.19662224  0.28775868 -0.13516489 -0.34410304  0.1936323  -0.08055928
  0.11132812 -0.01060597  0.13975409  0.03598341  0.25343108 -0.02690472
 -0.12069616 -0.19445577  0.05991372 -0.10372454 -0.06741315 -0.27864054
  0.01018045 -0.070884   -0.12606104  0.19368544  0.06050675  0.04160665
 -0.03807113 -0.11567736 -0.02542154 -0.16745104 -0.10655665  0.05398561
  0.13367105  0.05900685  0.04471144 -0.01069998  0.05160953  0.04140455
  0.0368476   0.10506568  0.01910705  0.08788014 -0.02926354  0.06057111
  0.07835005  0.22645216  0.01551444  0.03482153 -0.09262227 -0.11986373
  0.02735724 -0.03825155  0.04862401  0.10349592  0.03706342  0.07091626
 -0.1099636   0.08189918  0.04410104 -0.00217086 -0.03582961  0.30394608
  0.0747623   0.09946655 -0.12464805 -0.11016831  0.08104701  0.0156034
  0.17401803  0.01286027 -0.00375556 -0.02546848  0.11526553  0.15423381
  0.02495185 -0.09023719

### Print Word2Vec embeddings

In [19]:
word_pairs = [('luxury', 'performance'), ('crossover', 'midsize')]

# Create Word2Vec model
word2vec_model = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)

# Get word embedding for the word 'car'
word_embedding = word2vec_model.wv['performance']

In [20]:
# Create CBOW model
cbow_model = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)

In [21]:
print("Cosine similarity between word pairs (CBOW):")
for pair in word_pairs:
    similarity = cbow_model.wv.similarity(pair[0], pair[1])
    print(f"{pair[0]} vs {pair[1]}: {similarity}")

Cosine similarity between word pairs (CBOW):
luxury vs performance: 0.931273341178894
crossover vs midsize: 0.9255887866020203


In [22]:
# Create Skip Gram model
skipgram_model = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)

In [23]:
print("\nCosine similarity between word pairs (Skip Gram):")
for pair in word_pairs:
    similarity = skipgram_model.wv.similarity(pair[0], pair[1])
    print(f"{pair[0]} vs {pair[1]}: {similarity}")


Cosine similarity between word pairs (Skip Gram):
luxury vs performance: 0.935371994972229
crossover vs midsize: 0.8659140467643738


In [24]:
# Print results or further processing as needed
print("Bag-of-Words Matrix Shape:", count_occurs.shape)
print("TF-IDF Matrix Shape:", tfidf_count_occurs.shape)
print("Word Embedding for 'performance':", word_embedding)

Bag-of-Words Matrix Shape: (8084, 31)
TF-IDF Matrix Shape: (1, 31)
Word Embedding for 'performance': [ 0.15209259  0.23022792  0.0689415   0.15205634 -0.00855451 -0.1530008
  0.23197587  0.27179253 -0.11667452 -0.34279728  0.21484509 -0.00377729
  0.08853536 -0.09620951  0.14410843 -0.06543184  0.12244847  0.04073054
 -0.25889403 -0.13370629 -0.06372132 -0.11119022 -0.00532859 -0.13249247
 -0.11316031 -0.10435013  0.03717854  0.07193767 -0.01127891  0.06483596
 -0.06852204 -0.21037386 -0.06454483 -0.18923186 -0.04803101  0.13725972
  0.17189404  0.05568149  0.03045202 -0.00677226  0.10611449 -0.01877552
 -0.05495764  0.14235775  0.08138464  0.02625509  0.01014959  0.05879445
  0.05551432  0.13927393  0.03889471 -0.05025378 -0.03333646 -0.16748255
 -0.0836678   0.0221058   0.06075554  0.04392875 -0.04927633 -0.04482803
  0.00941699  0.09691627  0.07291638 -0.1244024  -0.00956965  0.29654428
  0.05779937  0.16467495 -0.1664528  -0.19730972  0.07610791  0.01999597
  0.16658075  0.127106  

## -------------------------------------------------

In [25]:
print("Performed by: Hrishikesh Bari || Roll No: 68")

Performed by: Hrishikesh Bari || Roll No: 68
