From 0028e9c09395f9f3617884c6f8c301201b95e693 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 9 Jul 2020 08:07:47 +0530 Subject: [PATCH] Added trick to Split (explode) pandas dataframe string entry to separate rows --- .../str.split()-checkpoint.ipynb | 351 ++++++++++++++++++ Code/str.split().ipynb | 179 ++++++++- 2 files changed, 526 insertions(+), 4 deletions(-) create mode 100644 Code/.ipynb_checkpoints/str.split()-checkpoint.ipynb diff --git a/Code/.ipynb_checkpoints/str.split()-checkpoint.ipynb b/Code/.ipynb_checkpoints/str.split()-checkpoint.ipynb new file mode 100644 index 0000000..bd97f05 --- /dev/null +++ b/Code/.ipynb_checkpoints/str.split()-checkpoint.ipynb @@ -0,0 +1,351 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import necessary libraries\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
name
0Elise Mccann
1Aiden Berger
2Elle Kelley
\n", + "
" + ], + "text/plain": [ + " name\n", + "0 Elise Mccann\n", + "1 Aiden Berger\n", + "2 Elle Kelley" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create a dataframe\n", + "df = pd.DataFrame({'name':['Elise Mccann', 'Aiden Berger', 'Elle Kelley']})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namefirst_namelast_name
0Elise MccannEliseMccann
1Aiden BergerAidenBerger
2Elle KelleyElleKelley
\n", + "
" + ], + "text/plain": [ + " name first_name last_name\n", + "0 Elise Mccann Elise Mccann\n", + "1 Aiden Berger Aiden Berger\n", + "2 Elle Kelley Elle Kelley" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# extract first name and last name\n", + "df['first_name'] = df['name'].str.split(' ', expand = True)[0]\n", + "df['last_name'] = df['name'].str.split(' ', expand = True)[1]\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split (explode) pandas dataframe string entry to separate rows" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameParticipation in
0Elise MccannChess,Football
1Aiden BergerCricket
2Elle KelleyFoosball,Carrom
\n", + "
" + ], + "text/plain": [ + " name Participation in\n", + "0 Elise Mccann Chess,Football\n", + "1 Aiden Berger Cricket\n", + "2 Elle Kelley Foosball,Carrom" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dx = pd.DataFrame({'name':['Elise Mccann', 'Aiden Berger', 'Elle Kelley'],\n", + " 'Participation in': ['Chess,Football','Cricket','Foosball,Carrom']})\n", + "dx" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def comma_separator(df, col):\n", + " constraints=df[col].apply(lambda x: str(x).split(',')).tolist()\n", + " df_new=pd.DataFrame(constraints, index=df['name'])\n", + " df_new=df_new.apply(pd.Series).stack()\n", + " df_new=pd.DataFrame(df_new)\n", + " df_new.reset_index(inplace=True)\n", + " df_new=df_new[['name',0]]\n", + " df_new.columns=['name','Participated in'] \n", + " df_new=df_new[~(df_new['Participated in']=='None')]\n", + " return df_new" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameParticipated in
0Elise MccannChess
1Elise MccannFootball
2Aiden BergerCricket
3Elle KelleyFoosball
4Elle KelleyCarrom
\n", + "
" + ], + "text/plain": [ + " name Participated in\n", + "0 Elise Mccann Chess\n", + "1 Elise Mccann Football\n", + "2 Aiden Berger Cricket\n", + "3 Elle Kelley Foosball\n", + "4 Elle Kelley Carrom" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comma_separator(dx,'Participation in')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Code/str.split().ipynb b/Code/str.split().ipynb index 455000b..bd97f05 100644 --- a/Code/str.split().ipynb +++ b/Code/str.split().ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -63,7 +63,7 @@ "2 Elle Kelley" ] }, - "execution_count": 2, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -148,6 +148,177 @@ "df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split (explode) pandas dataframe string entry to separate rows" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameParticipation in
0Elise MccannChess,Football
1Aiden BergerCricket
2Elle KelleyFoosball,Carrom
\n", + "
" + ], + "text/plain": [ + " name Participation in\n", + "0 Elise Mccann Chess,Football\n", + "1 Aiden Berger Cricket\n", + "2 Elle Kelley Foosball,Carrom" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dx = pd.DataFrame({'name':['Elise Mccann', 'Aiden Berger', 'Elle Kelley'],\n", + " 'Participation in': ['Chess,Football','Cricket','Foosball,Carrom']})\n", + "dx" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def comma_separator(df, col):\n", + " constraints=df[col].apply(lambda x: str(x).split(',')).tolist()\n", + " df_new=pd.DataFrame(constraints, index=df['name'])\n", + " df_new=df_new.apply(pd.Series).stack()\n", + " df_new=pd.DataFrame(df_new)\n", + " df_new.reset_index(inplace=True)\n", + " df_new=df_new[['name',0]]\n", + " df_new.columns=['name','Participated in'] \n", + " df_new=df_new[~(df_new['Participated in']=='None')]\n", + " return df_new" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameParticipated in
0Elise MccannChess
1Elise MccannFootball
2Aiden BergerCricket
3Elle KelleyFoosball
4Elle KelleyCarrom
\n", + "
" + ], + "text/plain": [ + " name Participated in\n", + "0 Elise Mccann Chess\n", + "1 Elise Mccann Football\n", + "2 Aiden Berger Cricket\n", + "3 Elle Kelley Foosball\n", + "4 Elle Kelley Carrom" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comma_separator(dx,'Participation in')" + ] + }, { "cell_type": "code", "execution_count": null, @@ -172,7 +343,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.7.3" } }, "nbformat": 4,