In [None]:
# System variables
import os
from glob import escape as glob_escape
from glob import glob

# Data processing libraries
import numpy as np
import pandas as pd

# Dataset connection
import opendatasets as od

# Graphic tools 
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt

# Word cloud generation (used once the data has been cleaned and pre-processed)
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Fetch the Kaggle dataset through opendatasets
DATASET_URL = "https://www.kaggle.com/datasets/yasirabdaali/artificial-intelligence-journals-ranking-20002021"
od.download(DATASET_URL)

In [None]:
def path_csvFiles(PATH=None, EXT="*.csv"):
    """
    Recursively collect every file under PATH whose name matches EXT.

    Each directory visited by os.walk is searched with glob, which follows
    Unix-shell pattern rules for EXT.

    Args:
        PATH: Root directory to search. When None (the default) the current
            working directory is looked up at call time, so a later
            os.chdir() is respected.
        EXT: Glob pattern applied inside each visited directory
            ("*.csv" by default).

    Returns:
        A sorted list with the path of every match found under PATH.
    """
    if PATH is None:
        PATH = os.getcwd()

    list_paths = []
    for directory, _subdirs, _files in os.walk(PATH):
        # Escape the directory part so names containing glob metacharacters
        # ("[", "]", "*", "?") are matched literally; only EXT is a pattern.
        list_paths.extend(glob(os.path.join(glob_escape(directory), EXT)))

    # Sort so the result does not depend on the order the filesystem lists entries.
    return sorted(list_paths)


def concat_paths(all_paths):
    """
    Read every CSV in all_paths and stack them into a single dataframe.

    Each file is read with ';' as the separator and gets a "Year" column
    holding the first four-digit, whitespace-separated token of its file
    name (e.g. "scimagojr 2000  Subject.csv" -> 2000). Only the base name is
    inspected, so whitespace in the directory part of the path is harmless.

    Args:
        all_paths: Iterable of paths to CSV files.

    Returns:
        A dataframe with the rows of every file and a fresh 0..n-1 index.

    Raises:
        ValueError: if all_paths is empty or a file name carries no
            four-digit year token.
    """
    all_paths = list(all_paths)
    if not all_paths:
        raise ValueError(
            "concat_paths received no CSV paths; check that the dataset was downloaded"
        )

    all_df = []
    for path in all_paths:
        file_name = os.path.basename(path)
        stem = os.path.splitext(file_name)[0]
        year_tokens = [tok for tok in stem.split() if len(tok) == 4 and tok.isdecimal()]
        if not year_tokens:
            raise ValueError(f"Cannot find a four-digit year in file name {file_name!r}")

        df = pd.read_csv(path, sep=';')
        df['Year'] = int(year_tokens[0])
        all_df.append(df)

    return pd.concat(all_df, ignore_index=True)

In [None]:
# Locate every CSV under the working directory and merge them into one frame
csv_paths = path_csvFiles()
df = concat_paths(csv_paths)
df.head()

In [None]:
# Show which cells are populated, to spot the empty columns that should be removed
sns.heatmap(data=df.notna())

In [None]:
# =============================================================================
# Non-null map of Type / Coverage / Publisher for the conference rows
# =============================================================================
conference_rows = df['Type'] == "conference and proceedings"
sns.heatmap(df.loc[conference_rows, ["Type", "Coverage", "Publisher"]].notnull())

In [None]:
# Preprocessing data

# Drop the conference records: keep only journals and book series.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
df = df.loc[df['Type'].isin(['journal', 'book series'])].copy()

In [None]:
# Remove every column whose label contains "20"
year_like_columns = df.filter(regex='20').columns.tolist()
df.drop(columns=year_like_columns, inplace=True)


In [None]:
# =============================================================================
# Data type of each column
# =============================================================================
df.dtypes

In [None]:
# Print the first 30 values of the columns at positions 5, 12 and 13
for position in (5, 12, 13):
    column_name = df.columns[position]
    preview = list(df[column_name])[:30]
    print(f"\033[1m {column_name}:\n\033[0m {preview}\n")
    

In [None]:
# Swap ',' for '.' in SJR, cast to float, and put 0 where the value is missing
df['SJR'] = (
    df['SJR']
    .replace(to_replace=',', value='.', regex=True)
    .astype(float)
    .fillna(0)
)

In [None]:
# Swap ',' for '.' in "Cites / Doc. (2years)" and cast the column to float
df['Cites / Doc. (2years)'] = (
    df['Cites / Doc. (2years)']
    .replace(to_replace=',', value='.', regex=True)
    .astype(float)
)


In [None]:
# Swap ',' for '.' in "Ref. / Doc." and cast the column to float
df['Ref. / Doc.'] = (
    df['Ref. / Doc.']
    .replace(to_replace=',', value='.', regex=True)
    .astype(float)
)

In [None]:
# Populated-cell map of the frame after the preprocessing steps
sns.heatmap(data=df.notna())

In [None]:
# =============================================================================
# Integer (int64) columns
# =============================================================================
int_df = df.select_dtypes(include='int64').copy()
print(f"[{int_df.shape[0]} rows x {int_df.shape[1]} columns]")

In [None]:
# =============================================================================
# Summarize the central tendency, dispersion and shape of the distribution
# of the int64 columns selected into int_df.
# =============================================================================
int_df.describe()

In [None]:
# =============================================================================
# Categorical (object dtype) columns
# =============================================================================
obj_df = df.select_dtypes(include='object').copy()
print(f"[{obj_df.shape[0]} rows x {obj_df.shape[1]} columns]")


In [None]:
# Summary produced by DataFrame.describe() for the object-dtype columns in obj_df
obj_df.describe()

In [None]:
# =============================================================================
# Categorical columns: preview of up to 30 distinct values per column
# =============================================================================
# Series.unique() keeps first-appearance order, so the preview is the same on
# every run; iterating a set of strings is not, because of hash randomisation.
for column_name in obj_df.columns:
    distinct_preview = list(obj_df[column_name].unique()[:30])
    print(("\033[1m {}: \n \033[0m {}\n").format(column_name, distinct_preview))

In [None]:
# Word cloud of the journal titles
text = df['Title'].values

# Number of distinct titles
print(len(set(text)))

# str() on a NumPy array abbreviates long arrays with "..." and keeps the
# brackets and quotes, so most titles would never reach the word cloud.
# Join every non-missing title into one plain string instead.
corpus = " ".join(df['Title'].dropna().astype(str))

wordcloud = WordCloud(width = 2000, height = 2000,
            background_color = 'white',
            stopwords = STOPWORDS,
            min_font_size = 2,
            max_font_size = 100).generate(corpus)

plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

# Words excluded from the cloud
print(STOPWORDS)