In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from lightgbm import LGBMRegressor

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Preprocessing Function
def preprocess_data(file_path):
    city_df = pd.read_csv(file_path)
    city_df['avg_citation_sum'] = city_df['citation_sum'] / city_df['p_count']
    city_df = city_df.dropna(subset=['population', 'gdp_per_capita', 'lat', 'lon', 'safety_index', 'primary_language', 'avg_citation_sum'])

    # Log transformation for skewed features
    city_df['log_population'] = np.log1p(city_df['population'])
    city_df['log_gdp_per_capita'] = np.log1p(city_df['gdp_per_capita'])

    # Create GDP total feature
    city_df['gdp_total'] = city_df['population'] * city_df['gdp_per_capita']

    # Create safety-to-economic ratio
    city_df['safety_to_gdp'] = city_df['safety_index'] / city_df['gdp_per_capita']

    # Language family mapping
    language_family_mapping = {
        'English': 'Indo-European', 'Japanese': 'Japonic', 'Spanish': 'Indo-European', 
        'Thai': 'Tai-Kadai', 'French': 'Indo-European', 'Arabic': 'Afro-Asiatic', 
        'German': 'Indo-European', 'Chinese': 'Sino-Tibetan', 'Italian': 'Indo-European',
        'Portuguese': 'Indo-European', 'Indonesian': 'Austronesian', 'Turkish': 'Turkic',
        'Dutch': 'Indo-European', 'Korean': 'Koreanic', 'Vietnamese': 'Austroasiatic', 
        'Persian (Farsi)': 'Indo-European', 'Polish': 'Indo-European', 'Russian': 'Indo-European', 
        'Swedish': 'Indo-European', 'Norwegian Nynorsk': 'Indo-European', 'Danish': 'Indo-European',
        'Greek': 'Indo-European', 'Afrikaans': 'Indo-European', 'Burmese': 'Sino-Tibetan',
        'Guaraní': 'Tupian', 'Romanian': 'Indo-European', 'Ukrainian': 'Indo-European', 
        'Sinhala': 'Dravidian-Indic', 'Hungarian': 'Uralic', 'Bengali': 'Dravidian-Indic', 
        'Czech': 'Indo-European', 'Nepali': 'Dravidian-Indic', 'Slovak': 'Indo-European', 
        'Finnish': 'Uralic', 'Amharic': 'Afro-Asiatic', 'Serbian': 'Indo-European', 
        'Dzongkha': 'Sino-Tibetan', 'Slovene': 'Indo-European', 'Bulgarian': 'Indo-European', 
        'Aymara': 'Aymaran', 'Croatian': 'Indo-European', 'Belarusian': 'Indo-European', 
        'Kazakh': 'Turkic', 'Macedonian': 'Indo-European', 'Lao': 'Austroasiatic', 
        'Bosnian': 'Indo-European', 'Latvian': 'Indo-European', 'Khmer': 'Austroasiatic', 
        'Estonian': 'Uralic', 'Lithuanian': 'Indo-European', 'Belizean Creole': 'Creole', 
        'Armenian': 'Indo-European', 'Albanian': 'Indo-European', 'Icelandic': 'Indo-European',
        'Dari': 'Indo-European', 'Mongolian': 'Mongolic', 'Montenegrin': 'Indo-European',
        'Chibarwe': 'Niger-Congo', 'Seychellois Creole': 'Creole', 'Malay': 'Austronesian',
        'Azerbaijani': 'Turkic', 'Maldivian': 'Indo-Aryan', 'Kyrgyz': 'Turkic'
    }
    city_df['language_family'] = city_df['primary_language'].map(language_family_mapping).fillna('Other')

    return city_df

# Model Training and Evaluation Function
def evaluate_with_feature_engineering(city_df, feature_columns, target_column):
    # Define preprocess pipeline
    preprocess = ColumnTransformer(transformers=[
        ('num', StandardScaler(), feature_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['language_family'])
    ])

    # Model pipeline
    pipeline = Pipeline([
        ('preprocess', preprocess),
        ('model', RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1))
    ])

    # Split data
    X = city_df[feature_columns + ['language_family']]
    y = city_df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    pipeline.fit(X_train, y_train)

    # Evaluate model
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mae, r2, pipeline

# Main Script
if __name__ == "__main__":
    city_df = preprocess_data('../For_vizml/city_impute_pop_real.csv')

    # Define features and target
    feature_columns = ['log_population', 'log_gdp_per_capita', 'safety_index', 'gdp_total', 'safety_to_gdp']
    target_column = 'avg_citation_sum'
    #target_column = 'p_count'

    mae, r2, best_model = evaluate_with_feature_engineering(city_df, feature_columns, target_column)

    print("Model Evaluation:")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"R^2 Score: {r2:.2f}")


Model Evaluation:
Mean Absolute Error: 71.72
R^2 Score: -0.06


0.28


In [8]:
import pandas as pd
from itertools import combinations
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings

warnings.filterwarnings('ignore')  # Suppress warnings for clean output

# Preprocessing Function
def preprocess_data(file_path):
    city_df = pd.read_csv(file_path)
    city_df['avg_citation_sum'] = city_df['citation_sum'] / city_df['p_count']
    city_df = city_df.dropna(subset=['population', 'gdp_per_capita', 'lat', 'lon', 'safety_index', 'primary_language', 'avg_citation_sum'])
    city_df['primary_language'] = city_df['primary_language'].apply(lambda x: 1 if x == 'English' else 0)
    
    # Remove outliers using IQR
    Q1 = city_df['avg_citation_sum'].quantile(0.25)
    Q3 = city_df['avg_citation_sum'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    city_df_filtered = city_df[(city_df['avg_citation_sum'] >= lower_bound) & (city_df['avg_citation_sum'] <= upper_bound)]
    
    return city_df_filtered

# Model Training and Evaluation Function with Stratified Sampling
def evaluate_features_with_stratified_sampling(city_df, all_features):
    best_r2 = float('-inf')
    best_mae = float('inf')
    best_features = None
    best_model = None

    # Create bins for stratified sampling
    city_df['stratify_bins'] = pd.qcut(city_df['avg_citation_sum'], q=5, labels=False)

    for features in [list(comb) for i in range(1, len(all_features) + 1) for comb in combinations(all_features, i)]:
        X = city_df[features]
        y = city_df['avg_citation_sum']
        stratify_bins = city_df['stratify_bins']

        # Stratified train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=stratify_bins, test_size=0.2, random_state=42
        )

        # Use a pipeline with scaling and RandomForestRegressor
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Scale features
            ('model', RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1))
        ])

        # Train the model
        pipeline.fit(X_train, y_train)

        # Evaluate the model
        y_pred = pipeline.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        if r2 > best_r2:
            best_r2 = r2
            best_mae = mae
            best_features = features
            best_model = pipeline

    return best_r2, best_mae, best_features, best_model

# Main Script
if __name__ == "__main__":
    city_df_filtered = preprocess_data('../For_vizml/city_impute_pop_real.csv')
    all_features = ['population', 'gdp_per_capita', 'lat', 'lon', 'safety_index', 'primary_language']

    best_r2, best_mae, best_features, best_model = evaluate_features_with_stratified_sampling(city_df_filtered, all_features)

    print("Best Model Evaluation:")
    print(f"Best R^2 Score: {best_r2:.2f}")
    print(f"Best Mean Absolute Error: {best_mae:.2f}")
    print(f"Best Features: {best_features}")


Best Model Evaluation:
Best R^2 Score: 0.28
Best Mean Absolute Error: 19.72
Best Features: ['population', 'gdp_per_capita', 'lon', 'safety_index', 'primary_language']


In [31]:
city_df = pd.read_csv('../For_vizml/city_impute_pop_real.csv')
city_df['primary_language'].value_counts().index

Index(['English', 'Japanese', 'Spanish', 'Thai', 'French', 'Arabic', 'German',
       'Chinese', 'Italian', 'Portuguese', 'Indonesian', 'Turkish', 'Dutch',
       'Korean', 'Vietnamese', 'Persian (Farsi)', 'Polish', 'Russian',
       'Swedish', 'Norwegian Nynorsk', 'Danish', 'Greek', 'Afrikaans',
       'Burmese', 'Guaraní', 'Romanian', 'Ukrainian', 'Sinhala', 'Hungarian',
       'Bengali', 'Czech', 'Nepali', 'Slovak', 'Finnish', 'Amharic', 'Serbian',
       'Dzongkha', 'Slovene', 'Bulgarian', 'Aymara', 'Croatian', 'Belarusian',
       'Kazakh', 'Macedonian', 'Lao', 'Bosnian', 'Latvian', 'Khmer',
       'Estonian', 'Lithuanian', 'Belizean Creole', 'Armenian', 'Albanian',
       'Icelandic', 'Dari', 'Mongolian', 'Montenegrin', 'Chibarwe',
       'Seychellois Creole', 'Malay', 'Azerbaijani', 'Maldivian', 'Kyrgyz'],
      dtype='object', name='primary_language')

In [26]:
# Predict for a random city
random_city = city_df_filtered.sample(n=1, random_state=10)
city_features = random_city[best_features]
predicted_avg_citation_sum = best_model.predict(city_features)[0]

print(f"Predicted Average Citation Sum for the selected city: {predicted_avg_citation_sum:.2f}")
print("Random City Details:")
print(random_city)

Predicted Average Citation Sum for the selected city: 39.35
Random City Details:
      city_id      city country  citation_sum  p_count        lat         lon  \
1852     1853  Kanazawa   Japan          1264       18  36.561627  136.656882   

      population  gdp_per_capita   time_zone  safety_index  primary_language  \
1852    466029.0    33834.392106  Asia/Tokyo          77.1                 0   

      avg_citation_sum  stratify_bins  
1852         70.222222              4  


Data Set (new)

In [11]:
# Load the data
city_df = pd.read_csv('../For_vizml/city_impute_pop_real.csv')

# Create a new column avg_citation_sum by dividing citation_sum by p_count
city_df['avg_citation_sum'] = city_df['citation_sum'] / city_df['p_count']

# Drop rows with missing values in relevant columns
city_df = city_df.dropna(subset=['population', 'gdp_per_capita', 'lat', 'lon', 'safety_index', 'primary_language', 'avg_citation_sum'])

# Log transform skewed variables
city_df['log_population'] = np.log1p(city_df['population'])
city_df['log_gdp_per_capita'] = np.log1p(city_df['gdp_per_capita'])

# One-hot encode primary_language
city_df = pd.get_dummies(city_df, columns=['primary_language'], drop_first=True)

# KMeans clustering for geospatial grouping
kmeans = KMeans(n_clusters=5, random_state=42)
city_df['geo_cluster'] = kmeans.fit_predict(city_df[['lat', 'lon']])

# Select the features and target variable
X = city_df[['log_population', 'log_gdp_per_capita', 'lat', 'lon', 'safety_index', 'geo_cluster']]
y = np.log1p(city_df['avg_citation_sum'])  # Log-transform the target

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=city_df['geo_cluster']
)

print("Rows with NaN dropped, and train and test datasets are ready!")

Rows with NaN dropped, and train and test datasets are ready!
