In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the scoring dataset from the working directory; the exploration and
# pre-processing cells below all read from (and add columns to) `data`.
data = pd.read_csv("Scoring-Dataset-5.csv")

#### A. Initial data exploration 

##### A2. Calculate and analyze descriptive statistics for each attribute

1. Frequency of Values for Each Attribute

In [None]:
# One bar chart per text-valued attribute showing how often each value occurs.
# The identifier column is skipped.
text_columns = [
    name
    for name in data.select_dtypes(include=['object']).columns
    if name != "User_ID"
]

for column in text_columns:
    fig, ax = plt.subplots()
    frequencies = data[column].value_counts()
    frequencies.plot.bar(ax=ax, color="#1f77b4", edgecolor="black")
    ax.set_title(f'Frequency of Values for {column}')
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    plt.show()

2. Distribution, Measures of Central Tendency & Measures of Spread

In [None]:
# For every int64/float64 attribute (except the identifier) draw a 50-bin
# histogram annotated with mean, median and variance, then print the quartiles.
numeric_columns = (
    data.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop("User_ID", errors="ignore")
)

for column in numeric_columns:
    series = data[column]

    # Central tendency and spread for this attribute.
    mean_value = series.mean()
    median_value = series.median()
    variance_value = series.var()
    quartiles = series.quantile([0.25, 0.5, 0.75])

    fig, ax = plt.subplots()
    series.plot.hist(ax=ax, bins=50, color="#1f77b4", edgecolor="black")
    # Dashed red line marks the mean on the histogram.
    ax.axvline(
        mean_value,
        color='red',
        linestyle='dashed',
        linewidth=2,
        label=f'Mean: {mean_value:.2f}',
    )
    ax.set_title(f"Distribution of {column} (Mean: {mean_value:.2f}) (Median: {median_value:.2f})\n(Variance: {variance_value:.2f})")
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()

    print(quartiles)

3. Identify Outliers

In [None]:
# Box plot per int64/float64 attribute (identifier excluded); points drawn
# beyond the whiskers are the candidate outliers.
BOX_COLOR = "#1f77b4"
box_style = {"facecolor": BOX_COLOR, "linewidth": 0}
median_style = {"color": "white", "linewidth": 2}
whisker_style = {"color": BOX_COLOR, "linewidth": 2}
cap_style = {"color": BOX_COLOR, "linewidth": 2}

numeric_columns = (
    data.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop("User_ID", errors="ignore")
)

for column in numeric_columns:
    fig, ax = plt.subplots()
    data[column].plot.box(
        ax=ax,
        sym="o",
        patch_artist=True,
        boxprops=box_style,
        medianprops=median_style,
        whiskerprops=whisker_style,
        capprops=cap_style,
    )
    ax.set_title(f'Boxplot for {column}')
    plt.show()

##### A3. Advanced exploration

In [None]:
# Work on a copy so the label encoding below never touches `data`.
data_copy = data.copy()

# Collect the object-typed (categorical) columns.
categorical_columns = []
for name, dtype in data_copy.dtypes.items():
    if dtype == "object":
        categorical_columns.append(name)

# Replace each categorical column by integer codes; the single encoder is
# re-fitted on every column, so after the loop it only remembers the last one.
le = LabelEncoder()
for col in categorical_columns:
    encoded_values = le.fit_transform(data_copy[col])
    data_copy[col] = encoded_values

# Pairwise scatter plots of the remaining attributes, coloured by the encoded Gender.
excluded_columns = [
    "User_ID",
    "Browsed_Electronics_12Mo",
    "Bought_Electronics_12Mo",
    "Bought_Digital_Media_18Mo",
    "Bought_Digital_Books",
]
pairplot_frame = data_copy.drop(columns=excluded_columns)

sns.set(style="ticks")
sns.pairplot(pairplot_frame, hue="Gender", markers=["o", "s"])
plt.show()

#### B. Data pre-processing 

##### B1. Binning and Smoothing

In [None]:
def _smooth_by_bin_mean(values, bin_labels):
    """Smooth ``values`` by replacing each one with the mean of its bin.

    The bin mean is truncated toward zero to a whole number.

    Parameters
    ----------
    values : pd.Series
        Numeric values to smooth.
    bin_labels : pd.Series
        Bin assigned to each entry of ``values`` (same index).

    Returns
    -------
    pd.Series
        Nullable-integer ("Int64") series aligned with ``values``. Rows that
        have no bin (for example a missing value) stay <NA> instead of being
        given another bin's mean.
    """
    bin_means = values.groupby(bin_labels, observed=True).transform("mean")
    # np.trunc keeps NaN as NaN, so unbinned rows survive the integer cast as <NA>.
    return np.trunc(bin_means).astype("Int64")


ages = data['Age']

# Equi-width binning: three intervals of equal width over the observed age range.
bins_equi_width = pd.cut(ages, bins=3, labels=['bin1', 'bin2', 'bin3'])

# Equi-depth binning: three quantile-based intervals (roughly equal row counts).
bins_equi_depth = pd.qcut(ages, q=3, labels=['bin1', 'bin2', 'bin3'])

# Store the bin assignments next to the original attributes.
data["Equi-Width Bin"] = bins_equi_width
data["Equi-Depth Bin"] = bins_equi_depth

# Smoothing by bin means. The column names (including their spelling) are kept
# because they become the headers of the exported CSV files.
data['smothed_width_data'] = _smooth_by_bin_mean(ages, bins_equi_width)
data['smothed_depth_data'] = _smooth_by_bin_mean(ages, bins_equi_depth)

# Keep only the binning/smoothing columns for inspection and export.
binned_data = data[["Equi-Width Bin", "Equi-Depth Bin", "smothed_width_data", "smothed_depth_data"]]

print(binned_data.head(10))
binned_data.to_csv('bins_only.csv', index=False)


##### B2. Normalize "Age" attribute.

1. Min-max normalization.

In [None]:
# Min-max normalisation of "Age": (value - min) / (max - min).
age_column = data["Age"]
min_age, max_age = age_column.min(), age_column.max()

data["Ages_scaled"] = (age_column - min_age) / (max_age - min_age)

# Persist the frame, including the new column, then show the scaled values.
data.to_csv("scaled_data.csv", index=False)
print(data["Ages_scaled"])

2. Z-score normalization.

In [None]:

# Z-score normalisation of "Age": centre on the mean, scale by the standard deviation.
age_column = data['Age']
mean_age = age_column.mean()
std_age = age_column.std()  # pandas default: sample standard deviation (ddof=1)

data['Age_ZScore'] = age_column.sub(mean_age).div(std_age)

# Persist the frame, including the new column, then show the standardised values.
data.to_csv("scaled_data.csv", index=False)
print(data['Age_ZScore'])

##### B3. Discretize "Age" attribute:

1. Divides the continuous "Age" into five categories: Teenager, Young, Mid-Age, Mature, and Old.

In [None]:
# Age category edges and their labels. pd.cut builds right-closed intervals,
# i.e. (1, 16], (16, 35], (35, 55], (55, 70], (70, 150].
bins = [1, 16, 35, 55, 70, 150]
labels = ['Teenager', 'Young', 'Mid_Age', 'Mature', 'Old']

# include_lowest=True closes the first interval on the left ([1, 16]); without
# it an age equal to the lowest edge would fall outside every bin and become NaN.
# Ages outside the 1-150 range are still left as NaN.
data['Age_Category'] = pd.cut(data['Age'], bins=bins, labels=labels, include_lowest=True)

# Save the frame with the new 'Age_Category' column.
data.to_csv("scaled_data.csv", index=False)

# Display the frequency of each category.
category_counts = data['Age_Category'].value_counts()
print(f"\nFrequency of each category:\n{category_counts}")

##### B4. Binarize "Gender" attribute:

1. Converts the categorical "Gender" variable into a binary indicator variable (M → 1, F → 0)

In [None]:
# Binary encoding of gender: 'M' becomes 1 and 'F' becomes 0; any value not
# listed in the mapping ends up as NaN.
gender_mapping = dict(M=1, F=0)
gender_codes = data['Gender'].map(gender_mapping)
data['Gender_Binary'] = gender_codes

# Persist the frame with the new column, then show original and encoded values together.
data.to_csv("scaled_data.csv", index=False)

print(data.loc[:, ['Gender', 'Gender_Binary']])

#### C. Association Rules Mining

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Attributes mined as items. "Gender" takes part as an item that is True for 'M'.
item_columns = ['Family', 'Hobbies', 'Social_Club', 'Political', 'Professional', 'Religious', 'Support_Group', 'Gender']

# Explicit value -> boolean encoding. Existing booleans (and 0/1) pass through unchanged.
value_to_bool = {"Yes": True, "No": False, "M": True, "F": False, True: True, False: False}

# Load the dataset (assuming headers are present).
data2 = pd.read_csv("Community-participation-Dataset(5).csv")
raw_items = data2[item_columns]

# Encode column by column; anything that is not in the mapping becomes NaN here.
encoded_items = raw_items.apply(lambda col: col.map(value_to_bool))

# A value that was present in the file but could not be encoded is a data
# problem: fail early with the offending columns instead of deep inside apriori.
unexpected = raw_items.notna() & encoded_items.isna()
if unexpected.any().any():
    bad_columns = unexpected.columns[unexpected.any()].tolist()
    raise ValueError(
        f"Unexpected values (expected Yes/No, M/F or booleans) in columns: {bad_columns}"
    )

# Comparing with True yields a strictly boolean frame, which is the one-hot
# format apriori expects; missing entries compare unequal and so count as False.
data2 = encoded_items.eq(True)

# Frequent itemsets with support >= 0.1, then rules with confidence >= 0.7.
frequent_itemsets = apriori(data2, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Show and export the rules with their support and confidence.
dataf = rules[['antecedents', 'consequents', 'support', 'confidence']]
print(dataf)
dataf.to_csv("Apriori.csv", index=False)