In [1]:
# Librairies
import pandas as pd
import re
from nltk.stem import PorterStemmer
from nltk import word_tokenize, bigrams, trigrams
from collections import Counter
import os
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')

# Data import

Your data should be downloaded from https://apps.webofknowledge.com search engine. The format should be a .csv file.

Copy it in the same folder of the notebook and input the name in the code below.


In [53]:
current = 'ILRA_EXAMPLE_Data_rjs_cs_cc.xls'  # your new dataset
compare = ['ILRA_EXAMPLE_Data_rotary_jet_spinning.xls']  # your previus dataset

current_data = pd.read_excel(current)  # import in dataframe
raw_data = pd.DataFrame()

if not compare:
    print ("No previous search")
    raw_data = current_data.copy(deep=True)
else:
    print ("Comparing files and removing duplicate from previous searches")
    for filename in compare:
        df_compare_1 = pd.read_excel(filename)
        raw_data = pd.concat([df_compare_1, current_data]).drop_duplicates(keep=False).copy(deep=True)

# raw_data.head  # you can uncomment this line to check that the file appears correctly

Comparing files and removing duplicate from previous searches


# Data cleaning

1. duplicate removal
2. remove empty columns (NaN)
3. text cleaning

In [3]:
# locate duplicates
raw_data['dup'] = raw_data.duplicated(subset=None, keep='first')
# Counting the number of duplicates
raw_data['dup'].value_counts()
# Creating a new dataframe without the duplicates
raw_data_noDup = raw_data[raw_data['dup'] == False]
# Deleting the column with the True and False because because it is no more useful
del raw_data_noDup['dup']

In [4]:
# Removing useless columns (All articles have written nothing in those fields)
raw_data_useField = raw_data_noDup.dropna(axis=1, how='all')

In [5]:
# Changing the type of NaN to string
# (For text cleaning everything need to be string)
raw_data_str = raw_data_useField.fillna("NaN")

In [68]:
# Text cleaning (removal of "meaningless" words)
"""
SmartStoplist.txt
by Lisa Andreevna
Lisanka93/text_analysis_python_101. (n.d.). GitHub. Retrieved May 3, 2021,
from https://github.com/lisanka93/text_analysis_python_101
**** Note : nan is added to Lisa Andreevna's list ****
"""
# Definition of constant and variable
stop_words_file = 'SmartStoplist.txt'
stop_words = []
# Creating a list of stop words while reading the stop words's file
with open(stop_words_file, "r") as f:
    for line in f:
        stop_words.extend(line.split())
# Do not understand yet
stop_words = stop_words
"""
Definition of a cleaning function (preprocess before words analysis)
This function get a text and return a text (string) of stemmed word in
lowercase without stop words and any caracter except letter
"""
def preprocess(raw_text):
    """
    Keep only letters in the text (lowercase and capitals) using Regex (re).
    Replace all symboles with a blank space.
    """
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)
    # Change the capitals for lowercase AND split into a list of words (no expression)
    words = letters_only_text.lower().split()
    # Define a variable to receive only the useful crop (or not) words
    cleaned_words = []
    # Remove stop words (Take word in list of words and make a list of clean words)
    for word in words:
        if word not in stop_words:
            cleaned_words.append(word)
    # Stem word (Creating a new list of stemmed word with the clean one)
    stemmed_words = []
    for word in cleaned_words:
        word = PorterStemmer().stem(word)
        stemmed_words.append(word)
    # After all those changes, convert back the final list into string
    return " ".join(stemmed_words)


In [None]:
# Clean abstracts of all the articles of the research (overwrite)
raw_data_str['Abstract'] = raw_data_str['Abstract'].apply(preprocess)
raw_data_str['Article Title'] = raw_data_str['Article Title'].apply(preprocess)
raw_data_str['Author Keywords'] = raw_data_str['Author Keywords'].apply(preprocess)
clean_data = raw_data_str

# Data exploration

1. Most common occurences
2. Find biagrams and trigrams 



In [61]:
# word count

def word_count(category, clean_data):
    # Most common words in all the abstracts (top 100)
    top_hundred = Counter(" ".join(clean_data[category]).split()).most_common(100)

    # Occurence of all the clean words (approximate number by trial and error)
    clean_words_occ = Counter(" ".join(clean_data[category]).split()).most_common(2900)

    # Most common bigrams and trigrams in clean data
    # Puting all abstracts into a list
    all_abstracts_list = clean_data[category].tolist()
    # Defining variables
    all_abstracts_bigrams = []
    all_abstracts_trigrams = []

    # Creating list of bigrams and trigrams by abstracts, i.e. list[0]=allBigramOfAbs1
    for abstracts in all_abstracts_list:
        abstracts = word_tokenize(abstracts)
        all_abstracts_bigrams.append(list(bigrams(abstracts)))
        all_abstracts_trigrams.append(list(trigrams(abstracts)))

    # Obtaining the most commons ones by abstracts for all of them
    top3_bi = []
    for bi_by_abst in all_abstracts_bigrams:
        top3_bi_by_abst = Counter(bi_by_abst).most_common(3)
        top3_bi.append(top3_bi_by_abst)
    top3_tri = []
    for tri_by_abst in all_abstracts_trigrams:
        top3_tri_by_abst = Counter(tri_by_abst).most_common(3)
        top3_tri.append(top3_tri_by_abst)

    return top_hundred, clean_words_occ, top3_bi, top3_tri

## Abstracts

In [63]:
category = 'Abstract'

top_hundred, clean_word_occ, top3_bi, top3_tri = word_count(category, clean_data)
# uncomment to get a look
#top_hundred
#(top3_bi)
#(top3_tri)

## Analysis in the titles

In [69]:
category = 'Article Title'

#top_hundred, clean_word_occ, top3_bi, top3_tri = word_count(category, clean_data)
# uncomment to get a look
#top_hundred
#(top3_bi)
#(top3_tri)

[('rotari', 29),
 ('jet', 29),
 ('spin', 26),
 ('scaffold', 15),
 ('nanofib', 14),
 ('fiber', 12),
 ('tissu', 9),
 ('polym', 7),
 ('produc', 7),
 ('spun', 6),
 ('product', 6),
 ('electrospin', 6),
 ('engin', 5),
 ('high', 5),
 ('fibrou', 5),
 ('applic', 5),
 ('poli', 4),
 ('polycaprolacton', 4),
 ('fabric', 4),
 ('nanofibr', 4),
 ('centrifug', 3),
 ('obtain', 3),
 ('potenti', 3),
 ('biomimet', 3),
 ('repair', 3),
 ('low', 3),
 ('hybrid', 3),
 ('model', 3),
 ('porou', 3),
 ('protein', 3),
 ('multi', 3),
 ('nanotub', 3),
 ('heart', 3),
 ('oxid', 2),
 ('beta', 2),
 ('vitro', 2),
 ('vivo', 2),
 ('evalu', 2),
 ('effect', 2),
 ('composit', 2),
 ('solvent', 2),
 ('techniqu', 2),
 ('base', 2),
 ('scale', 2),
 ('deliveri', 2),
 ('estrogen', 2),
 ('promot', 2),
 ('skin', 2),
 ('nano', 2),
 ('hydroxyapatit', 2),
 ('influenc', 2),
 ('polyurethan', 2),
 ('gelatin', 2),
 ('comparison', 2),
 ('pcl', 2),
 ('acid', 2),
 ('extracellular', 2),
 ('matrix', 2),
 ('membran', 2),
 ('plla', 2),
 ('system', 2)

## Analysis in the Author's keywords

In [66]:
category = 'Author Keywords'

#top_hundred, clean_word_occ, top3_bi, top3_tri = word_count(category, clean_data)
# uncomment to get a look
#top_hundred
#(top3_bi)
#(top3_tri)

[('spinning;', 27),
 ('jet', 18),
 ('Rotary', 13),
 ('rotary', 10),
 ('NaN', 10),
 ('centrifugal', 5),
 ('tissue', 5),
 ('engineering', 5),
 ('Tissue', 5),
 ('engineering;', 5),
 ('spinning', 4),
 ('Nanofiber;', 4),
 ('Electrospinning;', 4),
 ('nanofibers;', 4),
 ('electrospinning;', 3),
 ('healing;', 3),
 ('fibers;', 3),
 ('scaffold;', 3),
 ('Centrifugal', 3),
 ('carbon', 3),
 ('Nanofiber', 3),
 ('fiber', 2),
 ('rheology;', 2),
 ('materials;', 2),
 ('Bone', 2),
 ('jet-spinning;', 2),
 ('Fibers;', 2),
 ('of', 2),
 ('biomaterials;', 2),
 ('fiber;', 2),
 ('polyurethane;', 2),
 ('scaffolds', 2),
 ('mechanical', 2),
 ('Wound', 2),
 ('immersion', 2),
 ('nanofiber;', 2),
 ('composites;', 2),
 ('Rapid', 2),
 ('acid);', 2),
 ('Biomaterial;', 2),
 ('Polymer', 2),
 ('nanotubes;', 2),
 ('Spinning;', 2),
 ('applications;', 2),
 ('valve;', 2),
 ('Jet', 2),
 ('entangled', 1),
 ('polymers;', 1),
 ('nonwovens;', 1),
 ('Biocompatible', 1),
 ('transplantation;', 1),
 ('Regeneration', 1),
 ('Pullulan;', 

# Analysis in the Visualization

# Data save and export

In [25]:
# If there is no folder for the result create one
os.makedirs('Results', exist_ok=True)

# Word count data
clean_words_occ_df = pd.DataFrame(clean_words_occ, columns=['Word', 'Count'])
clean_words_occ_df.to_csv('./Results/ILRA_CleanWordsOccurence.csv', sep=';')

# Bigrams and trigrams TO CORRECT
abstract_grams_df = pd.DataFrame([all_abstracts_bigrams, all_abstracts_trigrams])
abstract_grams_df.to_csv('./Results/ILRA_abstract_grams.csv', sep=';')

In [65]:
clean_data.columns


Index(['Publication Type', 'Authors', 'Book Editors', 'Book Group Authors',
       'Author Full Names', 'Article Title', 'Source Title',
       'Book Series Title', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Author Keywords', 'Keywords Plus', 'Abstract', 'Addresses',
       'Reprint Addresses', 'Email Addresses', 'Researcher Ids', 'ORCIDs',
       'Funding Orgs', 'Funding Text', 'Cited Reference Count',
       'Times Cited, WoS Core', 'Times Cited, All Databases',
       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',
       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',
       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',
       'Publication Year', 'Volume', 'Issue', 'Supplement', 'Meeting Abstract',
       'Start Page', 'End Page', 'Article Number', 'DOI', 'Early Access Date',
       'Number of Pages', 'WoS Categories', 'Research Areas', 'IDS N