In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=d0cd50119ad30271d188fada4b9b1e3ab33284a93f3baf3126721271f92471d9
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [3]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("BigDataAnalysis").getOrCreate()


In [4]:
spark_df = spark.read.csv('/content/drive/MyDrive/tsmdata/globaltsmdata.csv', header=True, inferSchema=True)


In [5]:
# Show the first few rows
spark_df.show(5)

# Display schema
spark_df.printSchema()


+------------+-----+------+----+----------+--------+----------+-------+------------------+------+--------------------+---------+-------------+---------+----------+-----------+--------+--------+-------+-----+-----+-----+---------+-----------+---------------+--------+-------+-------+-----------+--------------------+-----------+---------------+-----------+---------------+---------+--------------------+------------+--------------------+--------------------+--------------------+-------+------------------+---------+-------------+------------+----------------+-----+-------+-------+-----------+---------+-------------+------------+----------------+-----+-------+-------+-----------+--------------------+--------+------+---------+------+---------+------+-----------+-----------+-----------+----------+------+--------+-------+---------+-------------+------+----------+--------------+------+----------+--------------+---------+---------+-------------+------------+--------------------+---------+---------

In [19]:
# Count the number of rows
row_count = spark_df.count()
print("Number of rows in the dataset:", row_count)


Number of rows in the dataset: 181691


In [7]:

# Count the number of events by country
spark_df.groupBy("country_txt").count().orderBy("count", ascending=False).show()

+--------------+-----+
|   country_txt|count|
+--------------+-----+
|          Iraq|24636|
|      Pakistan|14368|
|   Afghanistan|12731|
|         India|11960|
|      Colombia| 8306|
|   Philippines| 6908|
|          Peru| 6096|
|   El Salvador| 5320|
|United Kingdom| 5235|
|        Turkey| 4292|
|       Somalia| 4142|
|       Nigeria| 3907|
|      Thailand| 3849|
|         Yemen| 3347|
|         Spain| 3249|
|     Sri Lanka| 3022|
| United States| 2836|
|       Algeria| 2743|
|        France| 2693|
|         Egypt| 2479|
+--------------+-----+
only showing top 20 rows



In [8]:
from pyspark.sql.functions import col, isnan, when, count

missing_data = spark_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in spark_df.columns])
missing_data.show()


+-------+-----+------+----+----------+--------+----------+-------+-----------+------+----------+---------+----+--------+---------+-----------+--------+--------+-------+-----+-----+-----+---------+-----------+---------------+--------+-------+-------+-----------+---------------+-----------+---------------+-----------+---------------+---------+-------------+------------+----------------+-----+-------+-------+-----------+---------+-------------+------------+----------------+------+-------+-------+-----------+---------+-------------+------------+----------------+------+-------+-------+-----------+-----+--------+------+---------+------+---------+------+-----------+-----------+-----------+----------+------+--------+-------+---------+-------------+------+----------+--------------+------+----------+--------------+---------+---------+-------------+------------+----------------+---------+-------------+------------+----------------+---------+-------------+------------+----------------+---------+--

In [11]:
# Groups the data by year, counts the number of occurrences for each year
spark_df.groupBy("iyear").count().orderBy("iyear").show()

+-----+-----+
|iyear|count|
+-----+-----+
| 1970|  651|
| 1971|  471|
| 1972|  568|
| 1973|  473|
| 1974|  581|
| 1975|  740|
| 1976|  923|
| 1977| 1319|
| 1978| 1526|
| 1979| 2662|
| 1980| 2662|
| 1981| 2586|
| 1982| 2544|
| 1983| 2870|
| 1984| 3495|
| 1985| 2915|
| 1986| 2860|
| 1987| 3183|
| 1988| 3721|
| 1989| 4324|
+-----+-----+
only showing top 20 rows



In [11]:
spark_df.groupBy("region_txt").count().orderBy("count", ascending=False).show(10)

+--------------------+-----+
|          region_txt|count|
+--------------------+-----+
|Middle East & Nor...|50474|
|          South Asia|44974|
|       South America|18978|
|  Sub-Saharan Africa|17550|
|      Western Europe|16639|
|      Southeast Asia|12485|
|Central America &...|10344|
|      Eastern Europe| 5144|
|       North America| 3456|
|           East Asia|  802|
+--------------------+-----+
only showing top 10 rows



In [12]:
spark_df.groupBy("attacktype1_txt").count().orderBy("count", ascending=False).show()


+--------------------+-----+
|     attacktype1_txt|count|
+--------------------+-----+
|   Bombing/Explosion|88101|
|       Armed Assault|42566|
|       Assassination|19285|
|Hostage Taking (K...|11115|
|Facility/Infrastr...|10222|
|             Unknown| 7267|
|     Unarmed Assault| 1003|
|Hostage Taking (B...|  989|
|           Hijacking|  657|
|                   0|  167|
|                   1|  100|
|                   3|   56|
|                   7|   52|
|                   2|   40|
|                NULL|   35|
|                   6|   16|
|                   8|    5|
|                   9|    4|
|                   5|    2|
| the Red Hand Def...|    1|
+--------------------+-----+
only showing top 20 rows



In [14]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# Cast the columns to DoubleType
spark_df = spark_df.withColumn("nkill", col("nkill").cast(DoubleType()))
spark_df = spark_df.withColumn("nwound", col("nwound").cast(DoubleType()))

# Check if the casting was successful by printing the schema
spark_df.printSchema()


root
 |-- eventid: long (nullable = true)
 |-- iyear: integer (nullable = true)
 |-- imonth: integer (nullable = true)
 |-- iday: integer (nullable = true)
 |-- approxdate: string (nullable = true)
 |-- extended: integer (nullable = true)
 |-- resolution: string (nullable = true)
 |-- country: integer (nullable = true)
 |-- country_txt: string (nullable = true)
 |-- region: integer (nullable = true)
 |-- region_txt: string (nullable = true)
 |-- provstate: string (nullable = true)
 |-- city: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- specificity: integer (nullable = true)
 |-- vicinity: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- crit1: string (nullable = true)
 |-- crit2: string (nullable = true)
 |-- crit3: string (nullable = true)
 |-- doubtterr: string (nullable = true)
 |-- alternative: string (nullable = true)
 |-- alternative_txt: string (nullable

In [15]:
correlation = spark_df.corr("nkill", "nwound")
print(f"Correlation between number of kills and number of wounds: {correlation}")


Correlation between number of kills and number of wounds: 0.4437453082033473


In [16]:
pivot_table = spark_df.groupBy("iyear", "country_txt").count().groupBy("iyear").pivot("country_txt").sum("count")
pivot_table.show()

+-----+-----------+-------+-------+-------+------+-------------------+---------+-------+---------+-------+----------+-------+-------+----------+--------+-------+-------+------+-----+------+-------+------------------+--------+------+------+--------+------------+-------+--------+--------+------+------------------------+----+-----+-----+--------+-------+----------+-------+----+------+--------------+--------------+--------------------------------+-------+--------+--------+------------------+------------------+----------+-------+-----+-----------+-----------------+-------+-------+--------+----------------+----+-------+------+-------------+----------------+-----+------+-------+-------+-----+------+-------+----------+---------+------+-------------+------+-----+--------+---------+-------+-------+-----+---------+-------------+----+----+-------+------+-----+-----------+-------+-----+------+----------+-----+------+------+----------+----+------+-------+-------+-------+-----+---------+----------+--

**Data Cleaning**

Cleaning the dataset involves handling missing values and filtering irrelevant data:

In [6]:
# Drop rows with missing values in critical columns
df_clean = spark_df.dropna(subset=["eventid", "iyear", "imonth", "iday", "country_txt", "region_txt", "attacktype1_txt"])

# Filter data for specific analysis, e.g., only incidents post-2000
df_clean = df_clean.filter(df_clean["iyear"] > 2000)


**Basic Analysis**

In [7]:
# Count of Terrorist Attacks by Year
# Group by year and count the number of attacks
attacks_by_year = df_clean.groupBy("iyear").count().orderBy("iyear")
attacks_by_year.show()


+-----+-----+
|iyear|count|
+-----+-----+
| 2001| 1903|
| 2002| 1333|
| 2003| 1278|
| 2004| 1166|
| 2005| 2017|
| 2006| 2757|
| 2007| 3241|
| 2008| 4803|
| 2009| 4720|
| 2010| 4824|
| 2011| 5076|
| 2012| 8520|
| 2013|12036|
| 2014|16902|
| 2015|14963|
| 2016|13585|
| 2017|10898|
+-----+-----+



In [8]:
# Most Common Attack Types
# Group by attack type and count the occurrences
attack_types = df_clean.groupBy("attacktype1_txt").count().orderBy("count", ascending=False)
attack_types.show()


+--------------------+-----+
|     attacktype1_txt|count|
+--------------------+-----+
|   Bombing/Explosion|57877|
|       Armed Assault|25843|
|Hostage Taking (K...| 7869|
|       Assassination| 6821|
|Facility/Infrastr...| 5486|
|             Unknown| 4555|
|     Unarmed Assault|  572|
|Hostage Taking (B...|  358|
|           Hijacking|  305|
|                   0|  116|
|                   1|   74|
|                   3|   45|
|                   2|   37|
|                   7|   35|
|                   6|   14|
|                   8|    5|
|                   9|    3|
|                   5|    2|
| the Red Hand Def...|    1|
|          the choice|    1|
+--------------------+-----+
only showing top 20 rows

