## Kickstarter Campaign Success Predictor
Can we use machine learning to predict whether a Kickstarter campaign will succeed before it launches?

In [None]:
# Import libraries for data cleanup & visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

: 

In [None]:
# Ignore warnings / Cleans up output
# NOTE: the 'ignore' action is installed without a category or module filter, so
# every warning raised by later cells (including pandas deprecation and
# regex-related warnings) is hidden for the rest of the session.
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Notebook display settings: show every column, and up to 100 rows, of a frame
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [None]:
# Load the raw Kickstarter export and preview the first rows.
# The file is decoded as latin-1; '"' is the quote character and a backslash
# is treated as the escape character inside fields.
raw_csv_path = 'C:/Users/jlynn/Data_Projects/kickstarter_predictive_modeling/data/raw/ks-projects-201612.csv'
df = pd.read_csv(
    raw_csv_path,
    encoding='latin-1',
    quotechar='"',
    escapechar='\\',
)
print(f'Dataset shape: {df.shape}')
df.head()

In [None]:
# Check data types and missing values
# (info() prints each column's dtype and its non-null count)
df.info()

In [None]:
# Summary statistics
# (called with default arguments, so the summary covers the numeric columns
# whenever the frame has any)
df.describe()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Find column names
# (displayed as loaded, before the whitespace-stripping step that follows)
df.columns

In [None]:
# Normalize the header: strip leading/trailing whitespace from every column name.
# Whitespace inside a name (e.g. 'usd pledged') is left untouched.
stripped_names = df.columns.str.strip()
df.columns = stripped_names
df.columns

In [None]:
# Inspect the rows whose 'category' is missing.
# Missing 'name' values are acceptable; a missing 'category' is a problem
# because it marks a row whose data is misaligned and needs to be shifted left.
df.loc[df['category'].isna()]

In [None]:
# Flag the rows that have no 'category' value
shifted_mask = df['category'].isna()

# Fill the gap with each row's own 'main_category' value.
# Only 'category' is written here; no other column is moved by this cell.
replacement_values = df.loc[shifted_mask, 'main_category'].values
df.loc[shifted_mask, 'category'] = replacement_values

# Verify: any row still listed has no 'main_category' value either
df.loc[df['category'].isna()]

In [None]:
# Recount the missing values per column after filling 'category'
df.isna().sum()

In [None]:
# Inspect the rows whose 'country' is missing
df.loc[df['country'].isna()]

In [None]:
# How often does 'category' simply repeat 'main_category'?
matching = df.loc[df['category'] == df['main_category']]
n_matching, n_total = len(matching), len(df)
print(f"Rows where category == main_category: {n_matching} out of {n_total}")
print(f"Percentage: {n_matching/n_total*100:.1f}%")

In [None]:
# Rows with no 'country' value are treated as sitting one column too far left,
# so every column from 'category' onward is moved one position to the right.
needs_fix = df['country'].isna()

print(f"Rows needing fix: {needs_fix.sum()}")

# Left-to-right column order, from 'category' through the overflow columns
shift_order = [
    'category', 'main_category', 'currency', 'deadline', 'goal', 'launched',
    'pledged', 'state', 'backers', 'country', 'usd pledged',
    'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16',
]

# Walk from the right-most column towards the left so that each source column
# is copied into its right-hand neighbour before it is itself overwritten.
for dest_idx in range(len(shift_order) - 1, 0, -1):
    dest_col = shift_order[dest_idx]
    src_col = shift_order[dest_idx - 1]
    df.loc[needs_fix, dest_col] = df.loc[needs_fix, src_col].values

# 'category' has nothing to its left to receive, so it mirrors 'main_category'
df.loc[needs_fix, 'category'] = df.loc[needs_fix, 'main_category']

# Verify
print(f"\nNull country after fix: {df['country'].isna().sum()}")
df.loc[needs_fix].head()

In [None]:
# Recount the missing values per column after the right shift
df.isna().sum()

In [None]:
# Inspect the rows whose 'usd pledged' is missing
df.loc[df['usd pledged'].isna()]

In [None]:
# Treat a missing 'usd pledged' amount as 0
usd_col = 'usd pledged'
df[usd_col] = df[usd_col].fillna(0)

# Verify: this should display no rows
df.loc[df[usd_col].isna()]

In [None]:
# Find rows whose 'country' is malformed, i.e. contains the literal text 'N,'
# (for example the value 'N,"0'). regex=False makes the literal match explicit;
# non-string and missing values count as "not malformed" (na=False).
mask = df['country'].str.contains('N,', regex=False, na=False)

print(f"Rows with malformed country: {mask.sum()}")

# Replace country with the first 2 characters of currency for those rows
df.loc[mask, 'country'] = df.loc[mask, 'currency'].str[:2]

# Verify the fix
print("\nCountries after fix:")
print(df.loc[mask, ['country', 'currency']].head())

# Check if any 'N,"0' values remain.
# The literal lives in a variable because a backslash inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
malformed_value = 'N,"0'
remaining_malformed = (df['country'] == malformed_value).sum()
print(f"\nRemaining 'N,\"0' values: {remaining_malformed}")

In [None]:
# Recount the missing values per column after repairing 'country'
df.isna().sum()

In [None]:
# Inspect rows that still hold values in the overflow ('Unnamed') columns.
# Working explanation: the project name was split across several columns, which
# pushed the rest of the row to the right:
#   Unnamed: 13 filled -> 'name' + 'category' belong together as the name;
#                         'main_category' onward is shifted right 1 time
#   Unnamed: 14 filled -> 'name' .. 'main_category' belong together;
#                         'currency' onward is shifted right 2 times
#   Unnamed: 15 filled -> 'name' .. 'currency' belong together;
#                         'deadline' onward is shifted right 3 times
#   Unnamed: 16 filled -> 'name' .. 'deadline' belong together;
#                         'goal' onward is shifted right 4 times
df.loc[df['Unnamed: 16'].notna()]

In [None]:
# Repair the most displaced rows: those with a value in 'Unnamed: 16'
# (everything after the name sits 4 columns too far to the right).
mask_16 = df['Unnamed: 16'].notna()
print(f"Rows with Unnamed: 16 data: {mask_16.sum()}")

# 1. Rebuild the full name by joining 'name' through 'deadline' with ', '
rebuilt_name = df.loc[mask_16, 'name'].astype(str)
for fragment_col in ['category', 'main_category', 'currency', 'deadline']:
    rebuilt_name = rebuilt_name + ', ' + df.loc[mask_16, fragment_col].astype(str)
df.loc[mask_16, 'name'] = rebuilt_name

# 2. Pull every value 4 positions to the left. Going left to right means each
#    source column is read before it is overwritten.
column_order = [
    'category', 'main_category', 'currency', 'deadline', 'goal', 'launched',
    'pledged', 'state', 'backers', 'country', 'usd pledged',
    'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16',
]
for dest_col, src_col in zip(column_order, column_order[4:]):
    df.loc[mask_16, dest_col] = df.loc[mask_16, src_col].values

# 3. Blank out the four overflow columns for the repaired rows
df.loc[mask_16, column_order[-4:]] = None

# Verify
print(f"\nRemaining Unnamed: 16 values: {df['Unnamed: 16'].notna().sum()}")
df.loc[mask_16].head()

In [None]:
# Repair rows with a value in 'Unnamed: 15'
# (everything after the name sits 3 columns too far to the right).
mask_15 = df['Unnamed: 15'].notna()
print(f"Rows with Unnamed: 15 data: {mask_15.sum()}")

# 1. Rebuild the full name by joining 'name' through 'currency' with ', '
rebuilt_name = df.loc[mask_15, 'name'].astype(str)
for fragment_col in ['category', 'main_category', 'currency']:
    rebuilt_name = rebuilt_name + ', ' + df.loc[mask_15, fragment_col].astype(str)
df.loc[mask_15, 'name'] = rebuilt_name

# 2. Pull every value 3 positions to the left. Going left to right means each
#    source column is read before it is overwritten.
column_order = [
    'category', 'main_category', 'currency', 'deadline', 'goal', 'launched',
    'pledged', 'state', 'backers', 'country', 'usd pledged',
    'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15',
]
for dest_col, src_col in zip(column_order, column_order[3:]):
    df.loc[mask_15, dest_col] = df.loc[mask_15, src_col].values

# 3. Blank out the three overflow columns for the repaired rows
df.loc[mask_15, column_order[-3:]] = None

# Verify
print(f"\nRemaining Unnamed: 15 values: {df['Unnamed: 15'].notna().sum()}")
df.loc[mask_15].head()

In [None]:
# Repair rows with a value in 'Unnamed: 14'
# (everything after the name sits 2 columns too far to the right).
mask_14 = df['Unnamed: 14'].notna()
print(f"Rows with Unnamed: 14 data: {mask_14.sum()}")

# 1. Rebuild the full name by joining 'name' through 'main_category' with ', '
rebuilt_name = df.loc[mask_14, 'name'].astype(str)
for fragment_col in ['category', 'main_category']:
    rebuilt_name = rebuilt_name + ', ' + df.loc[mask_14, fragment_col].astype(str)
df.loc[mask_14, 'name'] = rebuilt_name

# 2. Pull every value 2 positions to the left. Going left to right means each
#    source column is read before it is overwritten.
column_order = [
    'category', 'main_category', 'currency', 'deadline', 'goal', 'launched',
    'pledged', 'state', 'backers', 'country', 'usd pledged',
    'Unnamed: 13', 'Unnamed: 14',
]
for dest_col, src_col in zip(column_order, column_order[2:]):
    df.loc[mask_14, dest_col] = df.loc[mask_14, src_col].values

# 3. Blank out the two overflow columns for the repaired rows
df.loc[mask_14, column_order[-2:]] = None

# Verify
print(f"\nRemaining Unnamed: 14 values: {df['Unnamed: 14'].notna().sum()}")
df.loc[mask_14].head()

In [None]:
# Repair rows with a value in 'Unnamed: 13'
# (everything after the name sits 1 column too far to the right).
mask_13 = df['Unnamed: 13'].notna()
print(f"Rows with Unnamed: 13 data: {mask_13.sum()}")

# 1. Rebuild the full name by joining 'name' and 'category' with ', '
name_head = df.loc[mask_13, 'name'].astype(str)
name_tail = df.loc[mask_13, 'category'].astype(str)
df.loc[mask_13, 'name'] = name_head + ', ' + name_tail

# 2. Pull every value 1 position to the left. Going left to right means each
#    source column is read before it is overwritten.
column_order = [
    'category', 'main_category', 'currency', 'deadline', 'goal', 'launched',
    'pledged', 'state', 'backers', 'country', 'usd pledged',
    'Unnamed: 13',
]
for dest_col, src_col in zip(column_order, column_order[1:]):
    df.loc[mask_13, dest_col] = df.loc[mask_13, src_col].values

# 3. Blank out 'Unnamed: 13' for the repaired rows
df.loc[mask_13, 'Unnamed: 13'] = None

# Verify that ALL overflow columns are now empty
print(f"\nRemaining Unnamed: 13 values: {df['Unnamed: 13'].notna().sum()}")
print(f"Remaining Unnamed: 14 values: {df['Unnamed: 14'].notna().sum()}")
print(f"Remaining Unnamed: 15 values: {df['Unnamed: 15'].notna().sum()}")
print(f"Remaining Unnamed: 16 values: {df['Unnamed: 16'].notna().sum()}")

# Final shape check
print(f"\nDataset shape: {df.shape}")
df.loc[mask_13].head()

In [None]:
# Remove the four overflow columns ('Unnamed: 13' .. 'Unnamed: 16')
overflow_columns = [f'Unnamed: {position}' for position in range(13, 17)]
df = df.drop(columns=overflow_columns)
print(f"\nDataset shape after dropping Unnamed columns: {df.shape}")

In [None]:
# Preview the first five rows after the overflow columns were dropped
df.head(5)

In [None]:
# Detect rows where the category text is still attached to the end of 'name'.
# The regex matches any name that contains a comma followed by more text.
pattern = r'.+,\s*(.+)$'

# A row is flagged only when BOTH conditions hold:
#   - its name matches the pattern, and
#   - 'category' equals 'main_category' (when a row is shifted, 'category'
#     holds what should be 'main_category').
name_has_trailing_part = df['name'].str.contains(pattern, regex=True, na=False)
category_is_duplicated = df['category'] == df['main_category']
mask = name_has_trailing_part & category_is_duplicated

print(f"Rows with name/category split issue: {mask.sum()}")
df.loc[mask, ['name', 'category', 'main_category']].head(10)

In [None]:
# Repair the rows flagged by `mask`: the real category is attached to the end
# of 'name' (after the last ', ') and every column from 'category' onward sits
# one position too far to the left.

# Split each affected name exactly ONCE on its last ', '.
# (Overwriting 'name' with the trimmed text first and then splitting that
# trimmed text again would throw the real category away and, for names with
# several commas, cut a second piece off the name.)
name_parts = df.loc[mask, 'name'].str.rsplit(', ', n=1)
actual_name = name_parts.str[0]
# Missing (NaN) when a name has no ', ' separator; see the fallback below
actual_category = name_parts.str[1].str.strip()

# Shift columns one position to the right. Work backwards (right-most first)
# so each source column is read before it is overwritten.
df.loc[mask, 'usd pledged'] = df.loc[mask, 'country'].values
df.loc[mask, 'country'] = df.loc[mask, 'backers'].values
df.loc[mask, 'backers'] = df.loc[mask, 'state'].values
df.loc[mask, 'state'] = df.loc[mask, 'pledged'].values
df.loc[mask, 'pledged'] = df.loc[mask, 'launched'].values
df.loc[mask, 'launched'] = df.loc[mask, 'goal'].values
df.loc[mask, 'goal'] = df.loc[mask, 'deadline'].values
df.loc[mask, 'deadline'] = df.loc[mask, 'currency'].values
df.loc[mask, 'currency'] = df.loc[mask, 'main_category'].values
df.loc[mask, 'main_category'] = df.loc[mask, 'category'].values

# Write the two halves of the split back (aligned on the row index)
df.loc[mask, 'name'] = actual_name
df.loc[mask, 'category'] = actual_category

# Fallback: where no category could be extracted, reuse 'main_category'
missing_category = mask & df['category'].isna()
df.loc[missing_category, 'category'] = df.loc[missing_category, 'main_category']

# Verify with the same definition that built `mask` (pattern match AND
# category == main_category); counting every name that merely contains a comma
# would also include names that were never broken.
still_flagged = (
    df['name'].str.contains(pattern, regex=True, na=False)
    & (df['category'] == df['main_category'])
)
print(f"\nRows with name/category split issue after fix: {still_flagged.sum()}")
df.loc[mask, ['name', 'category', 'main_category']].head(10)

In [None]:
#df.to_csv('data/cleaned/ks-projects-201612-cleaned.csv', index=False)