# World Bank Science and Technology Data Analysis

## Importing the Data

In [None]:
import pandas as pd

In [None]:
# Read science_tech_2018.csv into a DataFrame.
st_data_2018 = pd.read_csv("science_tech_2018.csv")
# Preview the first five rows (the default for head()).
st_data_2018.head(n=5)

In [None]:
# Dimensions of the raw 2018 table as a (rows, columns) tuple.
st_data_2018.shape

In [None]:
# Column labels of the raw 2018 table.
st_data_2018.columns

In [None]:
# Read science_tech_2009.csv into a DataFrame.
st_data_2009 = pd.read_csv("science_tech_2009.csv")
# Preview the first five rows (the default for head()).
st_data_2009.head(n=5)

## Cleaning the Data

### Missing Data

In [None]:
# Number of missing values in each column of the raw 2018 table.
st_data_2018.isna().sum()

In [None]:
# Keep only the 2018 rows that have no missing value in any column
# (axis=0 / how="any" are the dropna defaults, spelled out here).
st_data_2018_clean = st_data_2018.dropna(axis=0, how="any")
# Report how many rows and columns survive the cleaning step.
st_data_2018_clean.shape

In [None]:
# Keep only the 2009 rows that have no missing value in any column
# (axis=0 / how="any" are the dropna defaults, spelled out here).
st_data_2009_clean = st_data_2009.dropna(axis=0, how="any")
# Report how many rows and columns survive the cleaning step.
st_data_2009_clean.shape

## Exploratory Data Analysis

### Descriptive Statistics

In [None]:
# Session-wide setting: plain DataFrame output prints floats with three decimals.
pd.options.display.float_format = lambda value: '%.3f' % value

# Summary statistics of the cleaned 2018 data, rendered through the Styler
# with thousands separators and no decimal places.
summary_2018 = st_data_2018_clean.describe()
summary_2018.style.format("{:,.0f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib  inline

In [None]:
# Box plot of the high-technology export share in the cleaned 2018 data.
hightech_share_2018 = st_data_2018_clean['High-technology exports (% of manufactured exports)']
sns.boxplot(x=hightech_share_2018)

In [None]:
# Box plot of the high-technology export share in the cleaned 2009 data.
hightech_share_2009 = st_data_2009_clean['High-technology exports (% of manufactured exports)']
sns.boxplot(x=hightech_share_2009)

## Outliers

In [None]:
# One box plot per column of the cleaned 2018 data, skipping the first column.
columns = st_data_2018_clean.columns
# `i` stays the loop index (starting at 1) because a later cell reads it.
for i, column_name in enumerate(columns[1:], start=1):
    # A fresh 10x5 figure per column so the plots do not overlap.
    fig = plt.figure(figsize=(10, 5))
    sns.boxplot(x=st_data_2018_clean[column_name])

In [None]:
# Largest journal-article count among the cleaned 2018 rows.
articles_2018 = st_data_2018_clean['Scientific and technical journal articles']
articles_max = articles_2018.max()

In [None]:
# Show the 2018 row(s) whose article count equals articles_max.
is_top_publisher = st_data_2018_clean['Scientific and technical journal articles'] == articles_max
st_data_2018_clean.loc[is_top_publisher]

In [None]:
# Show the 2018 rows with more than 60,000 journal articles.
many_articles = st_data_2018_clean['Scientific and technical journal articles'] > 60000
st_data_2018_clean.loc[many_articles]

In [None]:
# One box plot per column of the cleaned 2009 data, skipping the first column.
columns = st_data_2009_clean.columns
# `i` stays the loop index (starting at 1) because a later cell reads it.
for i, column_name in enumerate(columns[1:], start=1):
    # A fresh 10x5 figure per column so the plots do not overlap.
    fig = plt.figure(figsize=(10, 5))
    sns.boxplot(x=st_data_2009_clean[column_name])

In [None]:
# Largest resident trademark-application count among the cleaned 2009 rows.
trademark_res_2009 = st_data_2009_clean['Trademark applications, direct resident']
trade_apps_res_max = trademark_res_2009.max()

In [None]:
# List the 2009 rows whose resident trademark applications exceed 200,000.
# This cell belongs to the 2009 outlier pass (2009 box plots and the 2009
# maximum come right before it), so the 2009 frame is filtered here instead
# of the 2018 one.
trademark_res_2009 = st_data_2009_clean['Trademark applications, direct resident']
st_data_2009_clean[trademark_res_2009 > 200000]

## Exploring Relationships

### Correlations

In [None]:
# Copy of the cleaned 2018 data without the 'Country Name' column,
# used as input for the correlation matrices.
st_data_18_nocountry = st_data_2018_clean.drop(columns='Country Name')

In [None]:
# Pearson correlation matrix, shaded with the viridis colour map.
pearson_corr = st_data_18_nocountry.corr(method='pearson')
pearson_corr.style.background_gradient(cmap='viridis')

In [None]:
# Spearman rank correlation matrix, shaded with the viridis colour map.
spearman_corr = st_data_18_nocountry.corr(method='spearman')
spearman_corr.style.background_gradient(cmap='viridis')

In [None]:
# Scatter plot for 2018: journal articles (x) against nonresident patent applications (y).
st_data_2018_clean.plot(
    kind='scatter',
    x='Scientific and technical journal articles',
    y='Patent applications, nonresidents',
)

In [None]:
# Cleaned 2018 data with the 'United States' row(s) excluded.
not_united_states = st_data_2018_clean['Country Name'] != 'United States'
st_data_2018_nonUS = st_data_2018_clean.loc[not_united_states]

In [None]:
# Same articles-vs-patents scatter plot, drawn from the frame without the United States.
st_data_2018_nonUS.plot(
    kind='scatter',
    x='Scientific and technical journal articles',
    y='Patent applications, nonresidents',
)

In [None]:
# Scatter plot for 2018: R&D expenditure share (x) against high-technology exports in US$ (y).
st_data_2018_clean.plot(
    kind='scatter',
    x='Research and development expenditure (% of GDP)',
    y='High-technology exports (current US$)',
)

In [None]:
import scipy.stats as stats

In [None]:
# Independent two-sample t-test on resident patent applications, 2009 vs 2018.
patent_column = 'Patent applications, residents'
stats.ttest_ind(
    st_data_2009_clean[patent_column],
    st_data_2018_clean[patent_column],
)

In [None]:
# Pull the last column from both cleaned frames for an ad-hoc comparison.
# columns[-1] is the column the leftover box-plot loop index `i` pointed at
# (the loop ends at len(columns) - 1); naming it explicitly means this cell
# no longer depends on a loop variable leaked from another cell.
last_column = columns[-1]
a = st_data_2009_clean[last_column]
b = st_data_2018_clean[last_column]

In [None]:
# Independent two-sample t-test (2009 vs 2018) for every column after the
# first, printing the statistic, the p-value and a verdict at the 0.05 level.
for i, indicator in enumerate(columns[1:], start=1):
    sample_2009 = st_data_2009_clean[indicator]
    sample_2018 = st_data_2018_clean[indicator]
    print(indicator)
    statistic, pvalue = stats.ttest_ind(sample_2009, sample_2018)
    print("Statistic: %s p-value: %s" % (statistic, pvalue))
    verdict = "Significant" if pvalue < 0.05 else "Not Significant"
    print(verdict)

In [None]:
# Preview the first five rows of the raw (uncleaned) 2009 table.
st_data_2009.head(n=5)