In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('sample_influencer_database_large.csv')

# Improved function to extract percentages
def extract_percentages(gender_str):
    try:
        # Split by comma to separate male and female parts
        parts = gender_str.split(',')
        
        # Extract male percentage from first part
        male_part = parts[0].strip()  # "XX% Male"
        male_percentage = float(male_part.split('%')[0])
        
        # Extract female percentage from second part
        female_part = parts[1].strip()  # "YY% Female"
        female_percentage = float(female_part.split('%')[0])
        
        return male_percentage, female_percentage
    except Exception as e:
        print(f"Error parsing: '{gender_str}'. Error: {e}")
        return None, None

# Apply the function to create new columns
df[['Male_Percentage', 'Female_Percentage']] = df['Audience_Gender'].apply(
    lambda x: pd.Series(extract_percentages(x))
)

# Drop the original Audience_Gender column
df.drop(columns=['Audience_Gender'], inplace=True)

# Check for missing values
print(df.isnull().sum())

# Save the cleaned dataset
df.to_csv('cleaned_influencer_data.csv', index=False)

# Display the first few rows to verify
print(df.head())

Influencer_ID         0
Name                  0
Platform              0
Followers             0
Engagement_Rate       0
Niche                 0
Avg_Likes             0
Avg_Comments          0
Audience_Age          0
Audience_Location     0
Past_Brand_Collabs    0
Fraud_Score           0
Male_Percentage       0
Female_Percentage     0
dtype: int64
   Influencer_ID          Name   Platform  Followers  Engagement_Rate  \
0              1  Influencer_1     TikTok     765340             6.18   
1              2  Influencer_2  Instagram     270668             4.82   
2              3  Influencer_3     TikTok     352022             2.63   
3              4  Influencer_4  Instagram     983553             5.61   
4              5  Influencer_5    YouTube     315287             6.74   

     Niche  Avg_Likes  Avg_Comments Audience_Age Audience_Location  \
0  Fashion      47298          2252        23-41                UK   
1    Music      13046           931        29-44         Australia   
2 

In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter

# Sample data with Hashtags
data = {
    "Influencer_ID": [1, 2, 3],
    "Post_Caption": [
        "Check out my new outfit! 🛍️",
        "Just dropped a new track! 🎶",
        "5 tips to save money 💰"
    ],
    "Hashtags": [
        "#fashion #ootd #style",
        "#music #newrelease #hiphop",
        "#finance #savings #money"
    ],
    "Comments": [
        "Love this look! 😍, Where’s this from?",
        "Fire! 🔥, Can’t wait to hear more!",
        "Great advice!, Very helpful!"
    ]
}

# Convert to DataFrame
df_posts = pd.DataFrame(data)

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to analyze sentiment using VADER
def analyze_sentiment_vader(text):
    return analyzer.polarity_scores(text)['compound']  # Returns a score between -1 (negative) and 1 (positive)

# Analyze sentiment for captions using VADER
df_posts['Caption_Sentiment_VADER'] = df_posts['Post_Caption'].apply(analyze_sentiment_vader)

# Analyze sentiment for comments using VADER
df_posts['Comments_Sentiment_VADER'] = df_posts['Comments'].apply(analyze_sentiment_vader)

# Display the results
print("Sentiment Analysis Results with VADER:")
print(df_posts[['Influencer_ID', 'Post_Caption', 'Caption_Sentiment_VADER', 'Comments', 'Comments_Sentiment_VADER']])

# Extract hashtags from the sample data
hashtags = []
for tags in df_posts['Hashtags']:
    hashtags.extend(tags.split())

# Count the frequency of each hashtag
hashtag_freq = Counter(hashtags)

# Display the most common hashtags
print("\nMost common hashtags:")
print(hashtag_freq.most_common())

Sentiment Analysis Results with VADER:
   Influencer_ID                 Post_Caption  Caption_Sentiment_VADER  \
0              1  Check out my new outfit! 🛍️                   0.0000   
1              2  Just dropped a new track! 🎶                   0.0000   
2              3       5 tips to save money 💰                   0.4939   

                                Comments  Comments_Sentiment_VADER  
0  Love this look! 😍, Where’s this from?                    0.8172  
1      Fire! 🔥, Can’t wait to hear more!                   -0.6580  
2           Great advice!, Very helpful!                    0.8306  

Most common hashtags:
[('#fashion', 1), ('#ootd', 1), ('#style', 1), ('#music', 1), ('#newrelease', 1), ('#hiphop', 1), ('#finance', 1), ('#savings', 1), ('#money', 1)]


In [3]:

# Calculate Audience Match Score
def calculate_audience_match_score(row):
    # Brand's target audience
    target_age_range = (18, 35)
    target_gender = {'Female': 60, 'Male': 40}
    target_locations = ['USA', 'UK', 'Canada']

    # Extract influencer's audience demographics
    influencer_age_range = tuple(map(int, row['Audience_Age'].split('-')))
    influencer_gender = {'Female': row['Female_Percentage'], 'Male': row['Male_Percentage']}
    influencer_location = row['Audience_Location']

    # Calculate age match
    age_match = 1 if target_age_range[0] <= influencer_age_range[0] and influencer_age_range[1] <= target_age_range[1] else 0

    # Calculate gender match
    gender_match = sum(min(target_gender[g], influencer_gender.get(g, 0)) for g in target_gender) / 100

    # Calculate location match
    location_match = 1 if influencer_location in target_locations else 0

    # Combine scores (equal weights for age, gender, and location)
    audience_match_score = (age_match + gender_match + location_match) / 3 * 100
    return audience_match_score

# Apply the function to calculate Audience Match Score
df['Audience_Match_Score'] = df.apply(calculate_audience_match_score, axis=1)

# Calculate Content Match Score
def calculate_content_match_score(row):
    # Brand's target niches
    target_niches = ['Fashion', 'Tech', 'Fitness']

    # Check if the influencer's niche matches the brand's target niches
    if row['Niche'] in target_niches:
        return 100  # Perfect match
    else:
        return 0  # No match

# Apply the function to calculate Content Match Score
df['Content_Match_Score'] = df.apply(calculate_content_match_score, axis=1)

# Option 1: Create a placeholder Sentiment Score based on engagement
# This is temporary until you have actual sentiment data
df['Sentiment_Score'] = df['Engagement_Rate'] * 10  # Scaling up engagement as a proxy

# Adjust Fraud Score
df['Adjusted_Fraud_Score'] = 100 - (df['Fraud_Score'] * 10)  # Scale from 0-100 since Fraud_Score is 1-10

# Calculate Matching Score
df['Matching_Score'] = (
    df['Engagement_Rate'] * 0.30 +
    df['Audience_Match_Score'] * 0.25 +
    df['Content_Match_Score'] * 0.20 +
    df['Sentiment_Score'] * 0.15 +
    df['Adjusted_Fraud_Score'] * 0.10
)

# Display the results with all components for verification
print(df[['Influencer_ID', 'Name', 'Engagement_Rate', 'Audience_Match_Score', 
         'Content_Match_Score', 'Sentiment_Score', 'Adjusted_Fraud_Score', 
         'Matching_Score']].sort_values(by='Matching_Score', ascending=False).head(10))

# If you want to save the results
df.to_csv('influencer_matching_scores.csv', index=False)

    Influencer_ID           Name  Engagement_Rate  Audience_Match_Score  \
94             95  Influencer_95             8.14             90.000000   
9              10  Influencer_10             8.93             95.000000   
88             89  Influencer_89             7.89             62.000000   
5               6   Influencer_6             8.99             56.333333   
84             85  Influencer_85             7.49             97.666667   
0               1   Influencer_1             6.18             64.333333   
87             88  Influencer_88             8.24             61.333333   
66             67  Influencer_67             8.83             60.333333   
71             72  Influencer_72             6.01             62.333333   
98             99  Influencer_99             9.39             55.666667   

    Content_Match_Score  Sentiment_Score  Adjusted_Fraud_Score  Matching_Score  
94                  100             81.4                    60       63.152000  
9           

In [13]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
df = pd.read_csv('influencer_matching_scores.csv')

# Simulate the target variable based on Matching_Score
df['Success'] = df['Matching_Score'].apply(lambda x: 1 if x >= 55 else 0)

# Save the dataset with the Success column
df.to_csv('influencer_matching_scores.csv', index=False)

# Features (X) and target variable (y)
X = df[['Engagement_Rate', 'Audience_Match_Score', 'Content_Match_Score', 'Sentiment_Score', 'Adjusted_Fraud_Score']]
y = df['Success']

# Apply Random Oversampling to balance the dataset
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Check the distribution of the resampled target variable
print("Resampled target variable distribution:")
print(y_resampled.value_counts())

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Resampled target variable distribution:
Success
1    91
0    91
Name: count, dtype: int64

Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        18

    accuracy                           1.00        37
   macro avg       1.00      1.00      1.00        37
weighted avg       1.00      1.00      1.00        37


Confusion Matrix:
[[19  0]
 [ 0 18]]


In [14]:
print(df['Success'].value_counts())

Success
0    91
1     9
Name: count, dtype: int64
