In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load customer data
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

In [5]:
# Merge transactions with products to get product information
transaction_details = pd.merge(transactions, products, on='ProductID')

In [6]:
# Aggregate transaction details to get customer profiles
customer_profile = transaction_details.groupby('CustomerID').agg({
    'Category': lambda x: x.value_counts().index[0],  # Most common category  # Total amount spent
    'TransactionID': 'count'  # Total number of transactions
}).reset_index()

In [7]:
# Merge customer profiles with customer data
customer_data = pd.merge(customers, customer_profile, on='CustomerID').reset_index()

# **Feature Engineneering**

In [8]:
# Encode categorical features
customer_data = pd.get_dummies(customer_data, columns=['Region', 'Category'])

In [9]:
# Standardize numerical features
scaler = StandardScaler()

In [10]:
# Drop non-numeric columns before scaling
columns_to_drop = ['CustomerID', 'CustomerName', 'SignupDate']
feature_columns = customer_data.drop(columns=columns_to_drop).columns

customer_data[feature_columns] = scaler.fit_transform(customer_data[feature_columns])

In [11]:
# Check the data after preprocessing
print(customer_data.head())

      index CustomerID        CustomerName  SignupDate  TransactionID  \
0 -1.723369      C0001    Lawrence Carroll  2022-07-10      -0.011458   
1 -1.705961      C0002      Elizabeth Lutz  2022-02-13      -0.467494   
2 -1.688553      C0003      Michael Rivera  2024-03-07      -0.467494   
3 -1.671145      C0004  Kathleen Rodriguez  2022-10-09       1.356650   
4 -1.653738      C0005         Laura Weber  2022-08-15      -0.923530   

   Region_Asia  Region_Europe  Region_North America  Region_South America  \
0    -0.532795      -0.579284             -0.548319              1.540416   
1     1.876893      -0.579284             -0.548319             -0.649175   
2    -0.532795      -0.579284             -0.548319              1.540416   
3    -0.532795      -0.579284             -0.548319              1.540416   
4     1.876893      -0.579284             -0.548319             -0.649175   

   Category_Books  Category_Clothing  Category_Electronics  \
0       -0.548319          -0.517219

# **Model Devlopment**

In [12]:
# Ensure only numeric columns are used for similarity computation
customer_features = customer_data[feature_columns]
cosine_sim = cosine_similarity(customer_features)

# Convert cosine similarity matrix to a DataFrame
similarity_df = pd.DataFrame(cosine_sim, index=customer_data['CustomerID'], columns=customer_data['CustomerID'])

# Function to get top 3 lookalikes for each customer
def get_top_lookalikes(customer_id, n=3):
    scores = similarity_df.loc[customer_id].sort_values(ascending=False)
    top_lookalikes = scores.iloc[1:n+1]  # Exclude the customer itself
    return list(zip(top_lookalikes.index, top_lookalikes.values))

# Generate lookalikes for the first 20 customers
lookalikes = {}
for customer_id in customers['CustomerID'][:20]:
    lookalikes[customer_id] = get_top_lookalikes(customer_id)

# Convert lookalikes to DataFrame
lookalikes_df = pd.DataFrame({'CustomerID': lookalikes.keys(), 'Lookalikes': lookalikes.values()})

In [13]:
# Save to CSV
lookalikes_df.to_csv('Krushit_Ghevariya_Lookalike.csv', index=False)