# Random Sampling from the Larger Database
This script performs random sampling from the larger database. It takes the Cochrane database as input and returns the randomly sampled rows as a dataframe. Here we sample 15% of the larger database; to sample a different proportion, simply change the percentage.

In [None]:
import pandas as pd
import math
import random

In [None]:
# Load the Cochrane Review database and discard every row that contains a
# missing value in any column (the original notes report 7152 rows).
cochrane_df = pd.read_csv("D:/GRA/cochrane.csv").dropna()

In [None]:
# Count the reviews in each (Review Year, Parent Group) combination; the
# result has one row per group, ordered by year and then by parent group.
group_sizes = cochrane_df.groupby(['Review Year', 'Parent Group']).size()
df1 = group_sizes.reset_index(name='Count')
# Sample 15% of every group, rounding up so that each group contributes at
# least one review (the original notes report 1112 sampled reviews in total).
sampling_fraction = 0.15
df1['Number of reviews to consider'] = [
    math.ceil(group_count * sampling_fraction) for group_count in df1['Count']
]

In [None]:
# Sort the reviews by Review Year and then Parent Group so that the rows of
# each group are contiguous, and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an 'index' column and dropping it afterwards (which
# would fail if the data already had a column called 'index').
df2 = cochrane_df.sort_values(
    ['Review Year', 'Parent Group'], ascending=True
).reset_index(drop=True)

In [None]:
# Build the grouping once and derive both helper columns from it.
review_groups = df2.groupby(['Review Year', 'Parent Group'])
# Position of each row inside its group, starting from 1.
position_in_group = review_groups.cumcount() + 1
# Number of the group each row belongs to, starting from 0. ngroup() numbers
# the groups in sorted key order, i.e. the same order in which
# groupby(...).size() lists them in df1, so group k here is row k of df1.
# (Deriving the number from the concatenated "year + group" string sorts the
# keys lexicographically as text, which is not guaranteed to match that order
# and can merge distinct keys that happen to concatenate to the same string.)
group_number = review_groups.ngroup()
df2['Assign'] = position_in_group
df2['Group index'] = group_number

In [None]:
# For every group, draw the within-group row positions that will be sampled.
# rand_list[k] is how many reviews to take from group k and
# collection_list[k] is how many reviews that group contains.
rand_list = df1['Number of reviews to consider'].tolist()
collection_list = df1['Count'].tolist()
# The 'Assign' positions run from 1 to the group size inclusive, so the
# population must be range(1, size + 1); range(1, size) would make the last
# row of every group impossible to select. Drawing from the full range also
# covers the "take the whole group" case without a special branch.
# The sample size is capped at the group size because random.sample cannot
# draw more distinct positions than exist.
select_list = [
    random.sample(range(1, group_size + 1), min(sample_size, group_size))
    for sample_size, group_size in zip(rand_list, collection_list)
]

In [None]:
# Pick, for every group, the rows whose within-group position was drawn.
# select_list[k] holds the drawn 'Assign' positions for the group whose
# 'Group index' is k.
sampled_frames = [
    df2[df2['Assign'].isin(positions) & (df2['Group index'] == group_idx)]
    for group_idx, positions in enumerate(select_list)
]
# Combine all per-group selections in a single concat. DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the accumulated
# frame on every iteration. pd.concat raises on an empty list, so fall back
# to an empty DataFrame when there are no groups.
if sampled_frames:
    df3 = pd.concat(sampled_frames, ignore_index=True)
else:
    df3 = pd.DataFrame()

In [None]:
# Write the sampled reviews to a CSV file in the current working directory.
output_path = "Cochrane_database_sampling.csv"
df3.to_csv(output_path)

In [None]:
# Show how many sampled reviews fall under each 'Review Type' value.
sampled_review_types = df3['Review Type']
sampled_review_types.value_counts()