In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the raw terrorism dataset from the local RAW data folder.
# encoding="latin1" is kept from the original call.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk. NOTE(review): the recorded run echoed
# a warning for this line, presumably a mixed-type DtypeWarning -- confirm.
df = pd.read_csv(
    "F:\\AssignmentTerotam\\DATA\\RAW\\Global Terrorism Data.csv",
    encoding="latin1",
    low_memory=False,
)


  df = pd.read_csv("F:\\AssignmentTerotam\\DATA\\RAW\\Global Terrorism Data.csv", encoding="latin1")


## Step 1: Data Structure Overview


In [5]:
# Number of rows and columns in the loaded frame.
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
df.info()
# Summary statistics for the numeric columns.
print(df.describe())


(181691, 135)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Columns: 135 entries, eventid to related
dtypes: float64(55), int64(22), object(58)
memory usage: 187.1+ MB
None
            eventid          iyear         imonth           iday  \
count  1.816910e+05  181691.000000  181691.000000  181691.000000   
mean   2.002705e+11    2002.638997       6.467277      15.505644   
std    1.325957e+09      13.259430       3.388303       8.814045   
min    1.970000e+11    1970.000000       0.000000       0.000000   
25%    1.991021e+11    1991.000000       4.000000       8.000000   
50%    2.009022e+11    2009.000000       6.000000      15.000000   
75%    2.014081e+11    2014.000000       9.000000      23.000000   
max    2.017123e+11    2017.000000      12.000000      31.000000   

            extended        country         region       latitude  \
count  181691.000000  181691.000000  181691.000000  177135.000000   
mean        0.045346     131.968501       7.

## Step 2: Missing Value Analysis



In [6]:
# Count missing entries per column, then show the 15 columns with the most gaps.
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False).head(15)


gsubname3           181671
weapsubtype4        181621
weapsubtype4_txt    181621
weaptype4_txt       181618
weaptype4           181618
claimmode3          181558
claimmode3_txt      181558
gsubname2           181531
claim3              181373
guncertain3         181371
divert              181367
gname3              181367
attacktype3         181263
attacktype3_txt     181263
ransomnote          181179
dtype: int64

Many columns in this dataset have missing values, but not all of those columns are useful for the analysis.

## Step 3: Feature Selection
To simplify the analysis, we select only the most relevant columns related to time, location, attack type, and impact.


In [7]:
# Columns kept for the analysis: date parts, location, attack/target/weapon
# descriptors, the responsible group, and the casualty counts.
cols = [
    'iyear', 'imonth', 'iday',
    'country_txt', 'region_txt', 'city',
    'attacktype1_txt', 'targtype1_txt',
    'weaptype1_txt', 'gname',
    'nkill', 'nwound'
]

# Take an explicit copy of the column subset: later cells assign into columns
# of this frame, and an independent copy keeps those writes unambiguous
# (no SettingWithCopy situation against the original wide frame).
df = df[cols].copy()
df.head()


Unnamed: 0,iyear,imonth,iday,country_txt,region_txt,city,attacktype1_txt,targtype1_txt,weaptype1_txt,gname,nkill,nwound
0,1970,7,2,Dominican Republic,Central America & Caribbean,Santo Domingo,Assassination,Private Citizens & Property,Unknown,MANO-D,1.0,0.0
1,1970,0,0,Mexico,North America,Mexico city,Hostage Taking (Kidnapping),Government (Diplomatic),Unknown,23rd of September Communist League,0.0,0.0
2,1970,1,0,Philippines,Southeast Asia,Unknown,Assassination,Journalists & Media,Unknown,Unknown,1.0,0.0
3,1970,1,0,Greece,Western Europe,Athens,Bombing/Explosion,Government (Diplomatic),Explosives,Unknown,,
4,1970,1,0,Japan,East Asia,Fukouka,Facility/Infrastructure Attack,Government (Diplomatic),Incendiary,Unknown,,


In [8]:
# Map the raw column names onto shorter, more readable ones.
column_renames = {
    'iyear': 'year',
    'imonth': 'month',
    'iday': 'day',
    'country_txt': 'country',
    'region_txt': 'region',
    'attacktype1_txt': 'attack_type',
    'targtype1_txt': 'target_type',
    'weaptype1_txt': 'weapon_type',
    'nkill': 'kill',
    'nwound': 'wound',
    'gname': 'group_name',
}
df = df.rename(columns=column_renames)

# Preview the first rows under the new names.
df.head()


Unnamed: 0,year,month,day,country,region,city,attack_type,target_type,weapon_type,group_name,kill,wound
0,1970,7,2,Dominican Republic,Central America & Caribbean,Santo Domingo,Assassination,Private Citizens & Property,Unknown,MANO-D,1.0,0.0
1,1970,0,0,Mexico,North America,Mexico city,Hostage Taking (Kidnapping),Government (Diplomatic),Unknown,23rd of September Communist League,0.0,0.0
2,1970,1,0,Philippines,Southeast Asia,Unknown,Assassination,Journalists & Media,Unknown,Unknown,1.0,0.0
3,1970,1,0,Greece,Western Europe,Athens,Bombing/Explosion,Government (Diplomatic),Explosives,Unknown,,
4,1970,1,0,Japan,East Asia,Fukouka,Facility/Infrastructure Attack,Government (Diplomatic),Incendiary,Unknown,,


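## Step 4: Handling Missing Numerical Values
Missing values in the `kill` and `wound` columns are replaced with 0. Note that this treats unreported casualty counts as zero casualties, which can lower averages computed from these columns.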


In [9]:
# Replace missing casualty counts with 0 in both numeric impact columns.
for casualty_col in ('kill', 'wound'):
    df[casualty_col] = df[casualty_col].fillna(0)

# Remaining missing values per column.
df.isnull().sum()


year             0
month            0
day              0
country          0
region           0
city           435
attack_type      0
target_type      0
weapon_type      0
group_name       0
kill             0
wound            0
dtype: int64

## Step 5: Handling Missing Categorical Values
Missing values in categorical columns are replaced with 'Unknown' to maintain consistency in analysis.


In [10]:
# Label records without a city as 'Unknown'.
city_filled = df['city'].fillna('Unknown')
df['city'] = city_filled

# Confirm that no missing values remain in any column.
df.isnull().sum()

year           0
month          0
day            0
country        0
region         0
city           0
attack_type    0
target_type    0
weapon_type    0
group_name     0
kill           0
wound          0
dtype: int64

## Step 6: Cleaning Date Information
Some records contain invalid month or day values (0). These are replaced with NaN for consistency.


In [11]:
# A month or day of 0 is not a valid calendar value, so those entries are
# blanked to NaN. Series.mask builds a new float column with NaN where the
# condition holds; the previous .loc assignment wrote NaN into an integer
# column and relied on silent upcasting, which newer pandas deprecates.
for date_part in ('month', 'day'):
    df[date_part] = df[date_part].mask(df[date_part] == 0)

# Show the cleaned columns (rich display truncates long frames automatically).
df[['month', 'day']]

        month   day
0         7.0   2.0
1         NaN   NaN
2         1.0   NaN
3         1.0   NaN
4         1.0   NaN
...       ...   ...
181686   12.0  31.0
181687   12.0  31.0
181688   12.0  31.0
181689   12.0  31.0
181690   12.0  31.0

[181691 rows x 2 columns]


## Step 7: Creating a Date Column
A combined date column is created for time-series analysis.


In [12]:
# Assemble one timestamp per record from the year/month/day columns.
# In the recorded output, rows with a missing month or day come out as NaT.
date_parts = df[['year', 'month', 'day']]
df['date'] = pd.to_datetime(date_parts)

print(df['date'])


0        1970-07-02
1               NaT
2               NaT
3               NaT
4               NaT
            ...    
181686   2017-12-31
181687   2017-12-31
181688   2017-12-31
181689   2017-12-31
181690   2017-12-31
Name: date, Length: 181691, dtype: datetime64[ns]


## Step 8: Final Data Quality Check
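We check the remaining missing values and the summary statistics before exploratory data analysis. The `month`, `day`, and `date` columns still contain missing values for records whose original month or day was 0; all other columns are complete.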


In [13]:
# Remaining missing values per column after all cleaning steps.
null_counts = df.isnull().sum()
print(null_counts)

# Summary statistics of the cleaned frame.
summary_stats = df.describe()
print(summary_stats)


year             0
month           20
day            891
country          0
region           0
city             0
attack_type      0
target_type      0
weapon_type      0
group_name       0
kill             0
wound            0
date           891
dtype: int64
                year          month            day           kill  \
count  181691.000000  181671.000000  180800.000000  181691.000000   
mean     2002.638997       6.467989      15.582058       2.266860   
min      1970.000000       1.000000       1.000000       0.000000   
25%      1991.000000       4.000000       8.000000       0.000000   
50%      2009.000000       6.000000      15.000000       0.000000   
75%      2014.000000       9.000000      23.000000       2.000000   
max      2017.000000      12.000000      31.000000    1570.000000   
std        13.259430       3.387810       8.768099      11.227057   

               wound                           date  
count  181691.000000                         180800  
mean      

In [14]:
# Save cleaned dataset (without the index column) to the PROCESSED folder.
# The path is a raw string: as a plain literal, "\A", "\D" and "\P" are
# invalid escape sequences and make Python emit a warning on this line.
# The resulting path text is exactly the same as before.
df.to_csv(
    r"F:\AssignmentTerotam\DATA\PROCESSED/terrorism_cleaned.csv",
    index=False,
)


  "F:\AssignmentTerotam\DATA\PROCESSED/terrorism_cleaned.csv",
