In [47]:
import pandas as pd

# Load the directory export into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='data/insa_all_cleaned.csv')

In [48]:
# Quick sanity check of the freshly loaded frame.
print(df.shape)      # (number of rows, number of columns)
print(df.head())     # first few rows
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() -- doing so emits a stray "None" line after the report.
df.info()            # data types and non-null counts

(6194, 5)
  first_name last_name phone department     role
0  Alexandre      Saad   NaN      GMPPA  student
1      Clara      Saad   NaN       FIMI  student
2      Elise      Haag   NaN         GM  student
3      Linda      Taam   NaN         MT  student
4      Peter      Saad   NaN         IF  student
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6194 entries, 0 to 6193
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  6194 non-null   object
 1   last_name   6194 non-null   object
 2   phone       875 non-null    object
 3   department  6194 non-null   object
 4   role        6194 non-null   object
dtypes: object(5)
memory usage: 242.1+ KB
None


In [4]:
# Drop rows that are identical across all columns, keeping the first occurrence.
# NOTE(review): this relies on hashing each cell, so it presumably has to run
# before the columns are converted to lists elsewhere in this notebook -- confirm.
df = df.drop_duplicates(subset=None, keep='first')

In [None]:
# Discard every row whose 'role' value is missing.
df = df.dropna(axis='index', subset=['role'])

In [53]:
# Persist the cleaned frame without the index column.
# NOTE: this is the same path the notebook loads from, so the source file is
# overwritten in place each time this cell runs.
df.to_csv(path_or_buf='data/insa_all_cleaned.csv', index=False)

In [None]:
# Keep only the people whose role entry contains 'student', show a short
# summary of the subset, then export it to its own CSV file.
student_mask = df['role'].apply(lambda entry: 'student' in entry)
students_df = df[student_mask]
print(students_df.shape, students_df.head(), sep='\n')
students_df.info()
students_df.to_csv('data/insa_students.csv', index=False)

(4794, 5)
  First Name Last Name Phone Department       Role
0  Alexandre      Saad          [GMPPA]  [student]
1      Clara      Saad           [FIMI]  [student]
2      Elise      Haag             [GM]  [student]
3      Linda      Taam             [MT]  [student]
4      Peter      Saad             [IF]  [student]
<class 'pandas.core.frame.DataFrame'>
Index: 4794 entries, 0 to 6506
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   First Name  4794 non-null   object
 1   Last Name   4794 non-null   object
 2   Phone       4794 non-null   object
 3   Department  4794 non-null   object
 4   Role        4794 non-null   object
dtypes: object(5)
memory usage: 224.7+ KB


In [None]:
# Flatten every row's role entry into one set to list the distinct role names.
unique_roles = {name for entry in df['role'] for name in entry}
print(unique_roles)

{'researcher', 'emeritus', 'staff', 'teacher', 'alum', 'student', 'off'}


In [None]:
# Keep the people whose role entry contains 'researcher' or 'teacher', show a
# short summary of the subset, then export it to its own CSV file.
academic_mask = df['role'].apply(
    lambda entry: any(wanted in entry for wanted in ('researcher', 'teacher'))
)
researchers_teachers_df = df[academic_mask]
print(researchers_teachers_df.shape, researchers_teachers_df.head(), sep='\n')
researchers_teachers_df.info()
researchers_teachers_df.to_csv('data/insa_researchers_teachers.csv', index=False)

(1199, 5)
    First Name Last Name                                             Phone  \
12      Raouhi     Sanaa                                                []   
16        Amin    Laafar                                                []   
27       Amine   Laaribi                                                []   
28  Abderrahim   Maazouz  [04 72 43 63 32, 04 72 43 82 25, 04 72 43 78 10]   
30      Hacina   Djaafri                                                []   

      Department                   Role  
12         [LVA]     [alum, researcher]  
16  [FIMI, DEEP]        [alum, teacher]  
27      [LAMCOS]  [student, researcher]  
28     [GM, IMP]              [teacher]  
30      [AMPERE]              [teacher]  
<class 'pandas.core.frame.DataFrame'>
Index: 1199 entries, 12 to 6502
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   First Name  1199 non-null   object
 1   Last Name   1199 non-null   object
 2   Ph

In [49]:
def split_multi_value(cell, sep=';'):
    """Split one ';'-separated cell into a list of trimmed, non-empty values.

    Args:
        cell: The cell content. Either a string such as "GM; IMP" (an empty
            string yields an empty list) or a list that an earlier run of this
            cell already produced.
        sep: Separator between the individual values.

    Returns:
        A list of stripped, non-empty strings. A list input is returned
        unchanged so that re-running this cell does not fail.
    """
    if isinstance(cell, list):
        # Already converted by a previous execution of this cell.
        return cell
    return [part.strip() for part in cell.split(sep) if part.strip()]


# Missing cells become '' first, so they end up as empty lists rather than NaN.
for col in ('role', 'department', 'phone'):
    df[col] = df[col].fillna('').apply(split_multi_value)

# Verify the result.
print(df.shape)
print(df.head())

(6194, 5)
  first_name last_name phone department       role
0  Alexandre      Saad    []    [GMPPA]  [student]
1      Clara      Saad    []     [FIMI]  [student]
2      Elise      Haag    []       [GM]  [student]
3      Linda      Taam    []       [MT]  [student]
4      Peter      Saad    []       [IF]  [student]


In [54]:
import csv
import json
import ast

INPUT_CSV = 'data/insa_all_cleaned.csv'    # replace with your CSV filename
OUTPUT_JSON = 'data/insa_all_cleaned.json' # desired output filename

# Columns whose cells hold the text of a Python list, e.g. "['GM', 'IMP']"
LIST_COLS = ['phone', 'department', 'role']


def parse_list_cell(raw):
    """Convert one CSV cell holding a Python list literal into a real list.

    Args:
        raw: Cell text as delivered by csv.DictReader. It is None when the
            row has fewer fields than the header (DictReader's default
            ``restval``), or when the column is absent altogether.

    Returns:
        [] for a missing or blank cell. Otherwise the evaluated literal as a
        list: a tuple or set is converted to a list, and any other literal is
        wrapped in a one-element list so that every record has the same shape.

    Raises:
        ValueError, SyntaxError, TypeError: If the text is not a valid Python
            literal.
    """
    text = (raw or '').strip()
    if not text:
        return []
    # literal_eval only evaluates literals; it does not execute arbitrary code.
    parsed = ast.literal_eval(text)
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [parsed]


data = []
unparsed_cells = 0  # cells that could not be parsed and fell back to []
with open(INPUT_CSV, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        for col in LIST_COLS:
            try:
                row[col] = parse_list_cell(row.get(col))
            except (ValueError, SyntaxError, TypeError):
                # Best effort: keep the record with an empty list, but count
                # the failure so the data loss is reported below.
                row[col] = []
                unparsed_cells += 1
        data.append(row)

# Write out as pretty-printed JSON
with open(OUTPUT_JSON, 'w', encoding='utf-8') as jsonfile:
    json.dump(data, jsonfile, ensure_ascii=False, indent=2)

print(f"Wrote {len(data)} records to {OUTPUT_JSON}")
if unparsed_cells:
    print(f"Warning: {unparsed_cells} cell(s) could not be parsed and were written as []")


Wrote 6194 records to data/insa_all_cleaned.json


In [52]:
# Blank out cells whose whole value is the placeholder '-' (literal match, no regex).
df = df.replace(to_replace='-', value='', regex=False)