In [8]:
import pandas as pd

# Load the dataset
file_path = "Customers.csv"  # Replace with your actual path
data = pd.read_csv(file_path)

# Display basic information
print("Dataset Overview:")
print(data.info())
print("\nSample Data:")
print(data.head())


Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None

Sample Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15


In [10]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Step 1: Handle missing values
data.fillna(data.mean(numeric_only=True), inplace=True)  # For numerical columns
data.fillna("Unknown", inplace=True)  # For categorical columns

# Step 2: Encode categorical variables
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Step 3: Scale numerical variables
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns

# Check if `numerical_cols` contains valid columns
if not numerical_cols.empty:
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
else:
    print("No numerical columns found for scaling.")

# Display the processed data
print("Preprocessed Data Sample:")
print(data.head())


No numerical columns found for scaling.
Preprocessed Data Sample:
   CustomerID  CustomerName  Region  SignupDate
0           0           119       3          34
1           1            54       0           3
2           2           137       3         127
3           3           103       3          46
4           4           116       0          36


In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Step 1: Handle missing values
data.fillna(data.mean(numeric_only=True), inplace=True)  # For numerical columns
data.fillna("Unknown", inplace=True)  # For categorical columns

# Step 2: Convert SignupDate to a numeric feature (if it's a date)
# Assuming 'SignupDate' is the number of days since signup, so convert it to numeric
if 'SignupDate' in data.columns:
    data['SignupDate'] = pd.to_numeric(data['SignupDate'], errors='coerce')

# Step 3: Encode categorical variables
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Step 4: Scale numerical variables (excluding CustomerID and CustomerName)
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns

# Exclude 'CustomerID' and 'CustomerName' if they are present
numerical_cols = [col for col in numerical_cols if col not in ['CustomerID', 'CustomerName']]

# Scale the remaining numerical columns
if numerical_cols:
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
else:
    print("No numerical columns found for scaling.")

# Display the processed data
print("Preprocessed Data Sample:")
print(data.head())


No numerical columns found for scaling.
Preprocessed Data Sample:
   CustomerID  CustomerName  Region  SignupDate
0           0           119       3          34
1           1            54       0           3
2           2           137       3         127
3           3           103       3          46
4           4           116       0          36


In [12]:
# Step 1: Print data types of all columns to check if they're correctly recognized
print("Data Types of Columns:")
print(data.dtypes)

# Step 2: Convert 'Region' and 'SignupDate' to numeric if they are not already
data['Region'] = pd.to_numeric(data['Region'], errors='coerce')  # Force conversion to numeric
data['SignupDate'] = pd.to_numeric(data['SignupDate'], errors='coerce')  # Force conversion to numeric

# Step 3: Handle missing values (since we forced conversion to numeric, some values might have become NaN)
data.fillna(data.mean(numeric_only=True), inplace=True)  # For numerical columns
data.fillna("Unknown", inplace=True)  # For categorical columns

# Step 4: Re-check the data types after conversion
print("Data Types of Columns After Conversion:")
print(data.dtypes)

# Step 5: Encode categorical variables (if any)
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Step 6: Identify numerical columns and scale them
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns

# Exclude 'CustomerID' and 'CustomerName' if they are present
numerical_cols = [col for col in numerical_cols if col not in ['CustomerID', 'CustomerName']]

# Scale the remaining numerical columns
if numerical_cols:
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
else:
    print("No numerical columns found for scaling.")

# Display the processed data
print("Preprocessed Data Sample:")
print(data.head())


Data Types of Columns:
CustomerID      int32
CustomerName    int32
Region          int32
SignupDate      int32
dtype: object
Data Types of Columns After Conversion:
CustomerID      int32
CustomerName    int32
Region          int32
SignupDate      int32
dtype: object
No numerical columns found for scaling.
Preprocessed Data Sample:
   CustomerID  CustomerName  Region  SignupDate
0           0           119       3          34
1           1            54       0           3
2           2           137       3         127
3           3           103       3          46
4           4           116       0          36


In [29]:
# Step 1: Print data types of all columns to check if they're correctly recognized
print("Data Types of Columns:")
print(data.dtypes)

# Step 2: Convert 'Region' and 'SignupDate' to numeric if they are not already
data['Region'] = pd.to_numeric(data['Region'], errors='coerce')  # Force conversion to numeric
data['SignupDate'] = pd.to_numeric(data['SignupDate'], errors='coerce')  # Force conversion to numeric

# Step 3: Handle missing values (since we forced conversion to numeric, some values might have become NaN)
data.fillna(data.mean(numeric_only=True), inplace=True)  # For numerical columns
data.fillna("Unknown", inplace=True)  # For categorical columns

# Step 4: Re-check the data types after conversion
print("Data Types of Columns After Conversion:")
print(data.dtypes)

# Step 5: Encode categorical variables (if any)
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Step 6: Identify numerical columns explicitly and scale them
# Include 'Region' and 'SignupDate' as numerical columns for scaling
numerical_cols = ['Region', 'SignupDate']

# Scale the numerical columns
if numerical_cols:
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
else:
    print("No numerical columns found for scaling.")

# Display the processed data
print("Preprocessed Data Sample:")
print(data.head())


Data Types of Columns:
CustomerID        int32
CustomerName      int32
Region          float64
SignupDate      float64
dtype: object
Data Types of Columns After Conversion:
CustomerID        int32
CustomerName      int32
Region          float64
SignupDate      float64
dtype: object
Preprocessed Data Sample:
   CustomerID  CustomerName    Region  SignupDate
0           0           119  1.241384   -1.062450
1           1            54 -1.409258   -1.647769
2           2           137  1.241384    0.693509
3           3           103  1.241384   -0.835874
4           4           116 -1.409258   -1.024687


In [34]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Example customer data (Region and SignupDate)
df_customers = pd.DataFrame({
    'CustomerID': [0, 1, 2, 3, 4],
    'CustomerName': [119, 54, 137, 103, 116],
    'Region': [3, 0, 3, 3, 0],
    'SignupDate': [34, 3, 127, 46, 36]
})

# Function to calculate similarity (using Region and SignupDate)
def calculate_similarity(user_data, customer_profiles):
    similarity_scores = cosine_similarity([user_data], customer_profiles)
    return similarity_scores.flatten()

# Function to recommend lookalikes based on similarity
def recommend_lookalikes(user_input, df_customers):
    # Extract relevant customer profile columns (Region and SignupDate)
    customer_profiles = df_customers[['Region', 'SignupDate']]
    
    # Calculate similarity scores
    similarity_scores = calculate_similarity(user_input['TransactionData'], customer_profiles)
    
    # Get top 3 most similar customers
    top_lookalikes = similarity_scores.argsort()[-3:][::-1]
    recommendations = [
        {'CustomerID': user_input['CustomerID'], 
         'LookalikeCustomerID': df_customers.iloc[i]['CustomerID'], 
         'SimilarityScore': similarity_scores[i]} 
        for i in top_lookalikes
    ]
    
    return recommendations

# Example user input (make sure 'TransactionData' matches the format: Region and SignupDate)
user_input = {'CustomerID': 101, 'TransactionData': [3, 34]}  # Example Region=3, SignupDate=34

# Get recommendations
recommendations = recommend_lookalikes(user_input, df_customers)

# Convert recommendations to a DataFrame and save to CSV
df_recommendations = pd.DataFrame(recommendations)

# Ensure the DataFrame is written to CSV
df_recommendations.to_csv("Lookalike_Recommendations.csv", index=False)

# Show the recommendations and confirm the CSV output
print("Recommendations:")
print(df_recommendations)

# Confirm the CSV creation
print("\nCSV file 'Lookalike_Recommendations.csv' created successfully.")


Recommendations:
   CustomerID  LookalikeCustomerID  SimilarityScore
0         101                    0         1.000000
1         101                    3         0.999738
2         101                    2         0.997928

CSV file 'Lookalike_Recommendations.csv' created successfully.


In [38]:
from IPython.display import FileLink

# Display download link for the CSV file
FileLink(r'Lookalike_Recommendations.csv')
