<a href="https://colab.research.google.com/github/lefaa/crimeStudy/blob/main/mergeddataxgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import xgboost as xgb
from datetime import datetime

# Load the data
df_all = pd.read_csv('pivotedmerged.csv')
df_all['date'] = pd.to_datetime(df_all['date'])

# Only keep useful columns
df_all = df_all.loc[:, (df_all != 0).any(axis=0)]  # remove all-zero cols
df_all = df_all[df_all['date'].dt.year < 2023]  # keep only past for training

# Define features and target (predict each country one by one)
features = df_all.drop(columns=['date', 'AF'])  # drop target + date
target = df_all['AF']

# Train model
dtrain = xgb.DMatrix(features, label=target)
params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'max_depth': 6, 'eta': 0.1}
model = xgb.train(params, dtrain, num_boost_round=100)

# Future dates to predict for (monthly)
future_months = pd.date_range(start='2024-01-01', end='2024-12-01', freq='MS')

# Use the last known features as a proxy for each future month
# (You can improve this using rolling stats, lag features, etc.)
last_row = features.iloc[-1]

# For each month, duplicate the last row and change the date
future_features = pd.DataFrame([last_row.values]*12, columns=features.columns)
future_features['date'] = future_months

# Predict for each country
countries = [col for col in df_all.columns if col not in ['date']]  # get all countries

all_preds = []

for country in countries:
    y_train = df_all[country]
    dtrain = xgb.DMatrix(features, label=y_train)
    model = xgb.train(params, dtrain, num_boost_round=100)

    df_future = future_features.copy()
    df_future.drop(columns=['date'], inplace=True)
    df_future_dm = xgb.DMatrix(df_future)

    preds = model.predict(df_future_dm)
    df_result = pd.DataFrame({
        'date': future_months,
        'country': country,
        'prediction': preds
    })
    all_preds.append(df_result)

# Concatenate and save
df_final = pd.concat(all_preds, ignore_index=True)
df_final.to_csv("timeseries_predictions.csv", index=False)
print("Predictions saved to timeseries_predictions.csv")


Predictions saved to timeseries_predictions.csv


In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os
from google.colab import drive

# Mount Google Drive and set working dir
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/GbigQuth')
# Load data
df = pd.read_csv('pivotedmerged.csv')
df['date'] = pd.to_datetime(df['date'])
df = df[df['date'].dt.year > 2012]  # Optional filter

# Save the date separately for joining later
dates = df[['date']].copy()
df = df.drop(columns=['date'])

# Remove all-zero columns
df = df.loc[:, (df != 0).any(axis=0)]

# Placeholder to collect predictions
prediction_records = []

# Iterate over each country (i.e., each column)
for target_country in df.columns:
    try:
        print(f"Training for {target_country}...")
        y = df[target_country]
        X = df.drop(columns=[target_country])

        X_train, X_test, y_train, y_test, dates_train, dates_test = train_test_split(
            X, y, dates, test_size=0.2, random_state=42
        )

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'max_depth': 6,
            'eta': 0.1
        }

        model = xgb.train(params, dtrain, num_boost_round=100)

        # Predict on full data
        d_all = xgb.DMatrix(X)
        y_pred_all = model.predict(d_all)

        # Store results
        for dt, pred in zip(dates['date'], y_pred_all):
            prediction_records.append({
                'date': dt,
                'country': target_country,
                'prediction': pred
            })

    except Exception as e:
        print(f"Skipping {target_country} due to error: {e}")

# Create final DataFrame
df_pred = pd.DataFrame(prediction_records)
df_pred.to_csv('timeseries_predictions.csv', index=False)
print("✅ timeseries_predictions.csv saved!")


Mounted at /content/drive
Training for AA...
Training for AC...
Training for AE...
Training for AF...
Training for AG...
Training for AJ...
Training for AL...
Training for AM...
Training for AN...
Training for AO...
Training for AQ...
Training for AR...
Training for AS...
Training for AU...
Training for AV...
Training for AY...
Training for BA...
Training for BB...
Training for BC...
Training for BD...
Training for BE...
Training for BF...
Training for BG...
Training for BH...
Training for BK...
Training for BL...
Training for BM...
Training for BN...
Training for BO...
Training for BP...
Training for BQ...
Training for BR...
Training for BT...
Training for BU...
Training for BX...
Training for BY...
Training for CA...
Training for CB...
Training for CD...
Training for CE...
Training for CF...
Training for CG...
Training for CH...
Training for CI...
Training for CJ...
Training for CM...
Training for CN...
Training for CO...
Training for CQ...
Training for CS...
Training for CT...
Train

In [8]:
!pip install dash pandas xgboost plotly

import dash
from dash import dcc, html, Input, Output
import pandas as pd
import plotly.express as px
import xgboost as xgb

# Load precomputed time-series predictions
# Columns: date, country (ISO-3), prediction
df_pred = pd.read_csv("timeseries_predictions.csv")
!pip install pycountry
import pycountry

def iso2_to_iso3(code):
    try:
        return pycountry.countries.get(alpha_2=code).alpha_3
    except:
        return None

# Only do this if codes are 2-letter, like "AF", "US", etc.
df_pred['country'] = df_pred['country'].apply(iso2_to_iso3)
df_pred = df_pred.dropna(subset=['country'])

df_pred['date'] = pd.to_datetime(df_pred['date'])

# Get available time points
date_options = sorted(df_pred['date'].unique())

# Start Dash app
app = dash.Dash(__name__)

app.layout = html.Div([
    html.H2("Conflict/Terror Forecast Over Time"),

    dcc.Slider(
        id='date-slider',
        min=0,
        max=len(date_options) - 1,
        value=len(date_options) - 1,
        marks={i: date.strftime('%Y-%m') for i, date in enumerate(date_options[::6])},
        step=1
    ),

    dcc.Graph(id='map-plot'),

    html.Div(id='selected-date', style={'fontSize': 20, 'marginTop': '10px'}),

    html.Hr(),

    dcc.Graph(id='country-timeseries'),
])

@app.callback(
    [Output('map-plot', 'figure'),
     Output('selected-date', 'children')],
    Input('date-slider', 'value')
)
def update_map(selected_index):
    selected_date = date_options[selected_index]
    filtered = df_pred[df_pred['date'] == selected_date]

    fig = px.choropleth(
        filtered,
        locations='country',
        locationmode='ISO-3',
        color='prediction',
        color_continuous_scale='OrRd',
        title=f'Predicted Intensity on {selected_date.date()}'
    )
    fig.update_geos(showcountries=True)

    return fig, f"Viewing Date: {selected_date.strftime('%Y-%m')}"

@app.callback(
    Output('country-timeseries', 'figure'),
    Input('map-plot', 'clickData')
)
def update_timeseries(clickData):
    if clickData:
        iso_code = clickData['points'][0]['location']
    else:
        iso_code = 'AF'  # Default

    df_country = df_pred[df_pred['country'] == iso_code]
    fig = px.line(df_country, x='date', y='prediction', title=f'{iso_code} Prediction Over Time')
    return fig

if __name__ == '__main__':
    app.run(debug=True)


Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



<IPython.core.display.Javascript object>